Unverified Commit 64936449 authored by q.yao, committed by GitHub

use huggingface tokenizer (#26)

* add hf tokenizer

* format

* fix for comment

* don't skip special tokens
parent 0cc48011
@@ -5,6 +5,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
RUN python3 -m pip install sentencepiece cmake transformers protobuf==3.20.3

ENV NCCL_LAUNCH_MODE=GROUP
from typing import List

import fire
import sys


class Tokenizer:

    def __init__(self, model_file: str):
        if model_file.endswith('.model'):
            from sentencepiece import SentencePieceProcessor
            self.model = SentencePieceProcessor(model_file=model_file)
            self.vocab_size = self.model.vocab_size()
            self.start_id = self.model.bos_id()
            self.end_id = self.model.eos_id()
            self.pad_id = self.model.pad_id()
        else:
            from transformers import AutoTokenizer
            self.model = AutoTokenizer.from_pretrained(model_file)
            self.vocab_size = self.model.vocab_size
            self.start_id = self.model.bos_token_id
            self.end_id = self.model.eos_token_id
            self.pad_id = self.model.pad_token_id
        print(f'vocab_size = {self.vocab_size}')
        print(f'start_id = {self.start_id}')
        print(f'end_id = {self.end_id}')
        print(f'pad_id = {self.pad_id}')

    def encode(self, s: str):
        if hasattr(self.model, 'Encode'):
            return self.model.Encode(s, add_bos=True)
        else:
            return self.model.encode(s, add_special_tokens=True)

    def decode(self, t: List[int]):
        if hasattr(self.model, 'Decode'):
            return self.model.Decode(t)
        else:
            return self.model.decode(t)


def main(model_file: str = '/data/llama/model/tokenizer.model',
         encode_file: str = None,
         decode_file: str = None):
    tokenizer = Tokenizer(model_file)
    if encode_file:
        with open(encode_file, 'r') as f:

@@ -54,4 +70,4 @@ def main(model_file: str = '/data/llama/model/tokenizer.model',

if __name__ == '__main__':
    fire.Fire(main)
\ No newline at end of file
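For orientation, a minimal usage sketch of the updated Tokenizer class above: a path ending in '.model' selects the sentencepiece backend, anything else is treated as a huggingface model directory. The paths below are placeholders, not part of this commit.

# Usage sketch; assumes the Tokenizer class defined above is in scope.
# Both paths are illustrative placeholders.
sp_tok = Tokenizer('/data/llama/model/tokenizer.model')  # sentencepiece branch
hf_tok = Tokenizer('/path/to/hf_model_dir')              # huggingface branch

ids = hf_tok.encode('hello world')  # token ids with special tokens added
print(hf_tok.decode(ids))           # decodes back to text
# Via fire, main() can also be invoked from the command line with the
# --model_file / --encode_file / --decode_file flags.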
@@ -107,13 +107,14 @@ class Chatbot:
        stop_words = None
        bad_words = np.array([[[self.eos_id], [1]]], dtype=np.int32)
        self.cfg = mmengine.Config(
            dict(
                session_len=session_len,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                stop_words=stop_words,
                bad_words=bad_words))
        self.log_level = log_level
        self.display = display
        self.profile_generation = profile_generation

@@ -200,13 +201,16 @@ class Chatbot:
            return StatusCode.TRITON_SESSION_CLOSED

        self._session.status = 0
        for status, _, _ in self._stream_infer(
                self._session,
                prompt='',
                request_output_len=0,
                sequence_start=False,
                sequence_end=True):
            if status != StatusCode.TRITON_STREAM_END:
                return status

        self.reset_session()
        return StatusCode.TRITON_STREAM_END

    def cancel(self, session_id: int, *args, **kwargs):

@@ -238,12 +242,13 @@ class Chatbot:
            return StatusCode.TRITON_SESSION_CLOSED

        prev_session = self._session
        for status, res, _ in self._stream_infer(
                self._session,
                prompt='',
                request_output_len=0,
                sequence_start=False,
                sequence_end=False,
                cancel=True):
            if status.value < 0:
                break
        if status == StatusCode.TRITON_STREAM_END:

@@ -336,11 +341,11 @@ class Chatbot:
        session.response = ''
        que = queue.Queue()
        producer = threading.Thread(
            target=self._stream_producer,
            args=(self.tritonserver_addr, session, que, self.cfg, input_ids,
                  input_lengths, request_output_len, sequence_start,
                  sequence_end, preseq_length, cancel))
        producer.start()
        for state, res, tokens in self.stream_consumer(
                self.postprocess, que, session, preseq_length, cancel, logger,

@@ -411,12 +416,13 @@ class Chatbot:
            random_seed * np.ones((1, 1), dtype=np.uint64))
        ]
        client.start_stream(callback)
        client.async_stream_infer(
            'fastertransformer',
            inputs,
            sequence_id=session.session_id,
            request_id=session.request_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end)
        que.put(None)

    @staticmethod
    ......
@@ -127,28 +127,29 @@ def export(model_name: str,
    vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
    assert _vocab_size == vocab_size, \
        f'different vocab size {_vocab_size} vs {vocab_size}'
    cfg = dict(
        llama=dict(
            model_name=model_name,
            head_num=head_num,
            size_per_head=size_per_head,
            vocab_size=vocab_size,
            num_layer=num_layer,
            rotary_embedding=size_per_head,
            inter_size=inter_size,
            norm_eps=norm_eps,
            attn_bias=attn_bias,
            start_id=bos_id,
            end_id=eos_id,
            weight_type='fp16',
            # parameters for fastertransformer
            max_batch_size=32,
            max_context_token_num=4,
            session_len=2048,
            step_length=1,
            cache_max_entry_count=48,
            cache_chunk_size=8,
            use_context_fmha=1,
            quant_policy=0))

    config = configparser.ConfigParser()
    for section, key_values in cfg.items():

@@ -166,7 +167,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        return False
    # read model arguments from params.json
    try:

@@ -190,9 +191,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
    def get_param(_name, _size):
        print(_name, _size)
        if _name not in model_params:
            model_params[_name] = torch.zeros(
                _size, dtype=torch.float16, device='cpu')
        return model_params[_name]

    for i, ckpt_path in enumerate(checkpoints):

@@ -204,7 +204,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
            size = param_data.size(0)
            if ext == 'weight':
                param = get_param(
                    param_name,
                    [size * n_ckpt, param_data.size(1)])
                param.data[size * i:size * (i + 1), :] = param_data
            else:  # bias
                param = get_param(param_name, [size * n_ckpt])

@@ -235,8 +236,9 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
    # concat qkv projection
    for t in ['weight', 'bias']:
        for i in range(1000):
            _qkv = [
                f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
            ]
            try:
                qkv = tuple(map(model_params.pop, _qkv))
            except KeyError:

@@ -278,8 +280,15 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
    if osp.exists(tokenizer_path):
        shutil.copy(tokenizer_path,
                    osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
        for json_file in os.listdir(model_path):
            if json_file.endswith(
                    '.json') and json_file != 'pytorch_model.bin.index.json':
                json_path = osp.join(model_path, json_file)
                shutil.copy(
                    json_path,
                    osp.join(triton_models_path, 'tokenizer', json_file))
    else:
        print(f'tokenizer model {tokenizer_path} does not exist')
        exit(-1)
    # read model arguments from params.json

@@ -371,19 +380,22 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
    for ft, hf in other:
        model_params[ft] = get_tensor(hf)

    return export(model_name, num_layer, norm_eps, model_params,
                  tokenizer_path, triton_models_path, tp)


def pack_model_repository(workspace_path: str):
    model_repo_dir = osp.join(workspace_path, 'model_repository')
    os.makedirs(model_repo_dir, exist_ok=True)
    os.symlink(
        src=osp.join('../triton_models/interactive'),
        dst=osp.join(model_repo_dir, 'fastertransformer'))
    os.symlink(
        src=osp.join('../triton_models/preprocessing'),
        dst=osp.join(model_repo_dir, 'preprocessing'))
    os.symlink(
        src=osp.join('../triton_models/postprocessing'),
        dst=osp.join(model_repo_dir, 'postprocessing'))


def main(model_name: str,
    ......
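For reference, a small self-contained sketch of how a nested dict like the cfg built in export() above can be serialized with configparser. The sample values and the 'config.ini' filename are assumptions for illustration, not taken from this diff.

import configparser

# Illustrative cfg; the real export() fills these from the model arguments.
cfg = dict(llama=dict(model_name='llama', head_num=32, vocab_size=32000))

config = configparser.ConfigParser()
for section, key_values in cfg.items():
    # configparser stores strings, so values are converted explicitly here.
    config[section] = {key: str(value) for key, value in key_values.items()}
with open('config.ini', 'w') as f:
    config.write(f)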
@@ -6,22 +6,63 @@ from typing import List
import numpy as np
import triton_python_backend_utils as pb_utils


class Tokenizer:

    def __init__(self, model_file: str):
        model_folder = osp.split(model_file)[0]
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
        use_hf_model = osp.exists(tokenizer_config_file)
        self.use_hf_model = use_hf_model
        if not self.use_hf_model:
            from sentencepiece import SentencePieceProcessor
            self.model = SentencePieceProcessor(model_file=model_file)
            self.vocab_size = self.model.vocab_size()
            self.start_id = self.model.bos_id()
            self.end_id = self.model.eos_id()
        else:
            from transformers import AutoTokenizer
            backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
            if not osp.exists(backend_tokenizer_file):
                print('WARNING: Can not find tokenizer.json. '
                      'It may take long time to initialize the tokenizer.')
            self.model = AutoTokenizer.from_pretrained(model_folder)
            self.vocab_size = self.model.vocab_size
            self.start_id = self.model.bos_token_id
            self.end_id = self.model.eos_token_id
            # save tokenizer.json to reuse
            if not osp.exists(backend_tokenizer_file):
                self.model.backend_tokenizer.save(backend_tokenizer_file)

    def encode(self, s: str):
        if not self.use_hf_model:
            add_bos = False
            add_eos = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '')
                add_bos = True
            if s == '<EOS>':
                s = ''
                add_eos = True
            return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
        else:
            add_special_tokens = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '<s>')
            if s == '<EOS>':
                s = '</s>'
            if len(s) == 0:
                add_special_tokens = True
            return self.model.encode(s, add_special_tokens=add_special_tokens)

    def decode(self, t: List[int]):
        if not self.use_hf_model:
            return self.model.Decode(t)
        else:
            skip_special_tokens = False
            return self.model.decode(
                t, skip_special_tokens=skip_special_tokens)


class TritonPythonModel:
    ......
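A short sketch of how the '<BOS>'/'<EOS>' placeholders are handled by the encode/decode pair above; the model path is a placeholder and the comments restate the branches shown in the diff.

# Usage sketch; assumes the Tokenizer class defined above is in scope.
# The path is an illustrative placeholder, not taken from this commit.
tok = Tokenizer('/workspace/triton_models/tokenizer/tokenizer.model')

# sentencepiece branch: '<BOS>' is stripped and passed as add_bos=True;
# huggingface branch: '<BOS>' is rewritten to the literal '<s>' token text.
ids = tok.encode('<BOS>hello')

# A bare '<EOS>' prompt encodes to an end-of-sequence-only sequence in both branches.
eos_ids = tok.encode('<EOS>')

# decode keeps special tokens (skip_special_tokens=False in the huggingface branch).
text = tok.decode(ids)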
@@ -7,31 +7,64 @@ from typing import List
import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from torch.nn.utils.rnn import pad_sequence


class Tokenizer:

    def __init__(self, model_file: str):
        model_folder = osp.split(model_file)[0]
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
        use_hf_model = osp.exists(tokenizer_config_file)
        self.use_hf_model = use_hf_model
        if not self.use_hf_model:
            from sentencepiece import SentencePieceProcessor
            self.model = SentencePieceProcessor(model_file=model_file)
            self.vocab_size = self.model.vocab_size()
            self.start_id = self.model.bos_id()
            self.end_id = self.model.eos_id()
        else:
            from transformers import AutoTokenizer
            backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
            if not osp.exists(backend_tokenizer_file):
                print('WARNING: Can not find tokenizer.json. '
                      'It may take long time to initialize the tokenizer.')
            self.model = AutoTokenizer.from_pretrained(model_folder)
            self.vocab_size = self.model.vocab_size
            self.start_id = self.model.bos_token_id
            self.end_id = self.model.eos_token_id
            # save tokenizer.json to reuse
            if not osp.exists(backend_tokenizer_file):
                self.model.backend_tokenizer.save(backend_tokenizer_file)

    def encode(self, s: str):
        if not self.use_hf_model:
            add_bos = False
            add_eos = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '')
                add_bos = True
            if s == '<EOS>':
                s = ''
                add_eos = True
            return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
        else:
            add_special_tokens = False
            if s.find('<BOS>') != -1:
                s = s.replace('<BOS>', '<s>')
            if s == '<EOS>':
                s = '</s>'
            if len(s) == 0:
                add_special_tokens = True
            return self.model.encode(s, add_special_tokens=add_special_tokens)

    def decode(self, t: List[int]):
        if not self.use_hf_model:
            return self.model.Decode(t)
        else:
            skip_special_tokens = False
            return self.model.decode(
                t, skip_special_tokens=skip_special_tokens)


class TritonPythonModel:

@@ -157,7 +190,6 @@ class TritonPythonModel:
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
        start_ids = pad_sequence(
            start_ids, batch_first=True, padding_value=self.end_id)
        return start_ids, start_lengths
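For reference, a small self-contained sketch of what the pad_sequence call above produces; end_id = 2 is an illustrative padding value, not taken from this commit.

import torch
from torch.nn.utils.rnn import pad_sequence

end_id = 2  # illustrative padding id
start_ids = [torch.IntTensor([1, 5, 7, 9]), torch.IntTensor([1, 4])]
start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])  # [[4], [2]]
padded = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
# padded -> tensor([[1, 5, 7, 9],
#                   [1, 4, 2, 2]], dtype=torch.int32)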