Unverified Commit 64936449 authored by q.yao, committed by GitHub

use huggingface tokenizer (#26)

* add hf tokenizer

* format

* fix for comment

* don't skip special tokens
parent 0cc48011
......@@ -5,6 +5,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
RUN python3 -m pip install sentencepiece cmake
RUN python3 -m pip install sentencepiece cmake transformers protobuf==3.20.3
ENV NCCL_LAUNCH_MODE=GROUP
from sentencepiece import SentencePieceProcessor
from typing import List
import fire
import sys
class Tokenizer:
def __init__(self, model_file: str):
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
self.pad_id = self.model.pad_id()
if model_file.endswith('.model'):
from sentencepiece import SentencePieceProcessor
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
self.pad_id = self.model.pad_id()
else:
from transformers import AutoTokenizer
self.model = AutoTokenizer.from_pretrained(model_file)
self.vocab_size = self.model.vocab_size
self.start_id = self.model.bos_token_id
self.end_id = self.model.eos_token_id
self.pad_id = self.model.pad_token_id
print(f'vocab_size = {self.vocab_size}')
print(f'start_id = {self.start_id}')
print(f'end_id = {self.end_id}')
print(f'pad_id = {self.pad_id}')
def encode(self, s: str):
return self.model.Encode(s, add_bos=True)
if hasattr(self.model, 'Encode'):
return self.model.Encode(s, add_bos=True)
else:
return self.model.encode(s, add_special_tokens=True)
def decode(self, t: List[int]):
return self.model.Decode(t)
if hasattr(self.model, 'Decode'):
return self.model.Decode(t)
else:
return self.model.decode(t)
def main(model_file: str = '/data/llama/model/tokenizer.model',
encode_file: str = None, decode_file: str = None):
encode_file: str = None,
decode_file: str = None):
tokenizer = Tokenizer(model_file)
if encode_file:
with open(encode_file, 'r') as f:
......@@ -54,4 +70,4 @@ def main(model_file: str = '/data/llama/model/tokenizer.model',
if __name__ == '__main__':
fire.Fire(main)
\ No newline at end of file
fire.Fire(main)
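For reference, a minimal usage sketch of the updated CLI tokenizer: the backend is picked from the path suffix, so a '.model' file goes through SentencePiece while any other path is loaded as a Hugging Face model directory. The SentencePiece path matches the default in main(); the Hugging Face directory below is a placeholder, not taken from the diff.

from typing import List

# SentencePiece backend: selected because the path ends with '.model'.
sp_tok = Tokenizer('/data/llama/model/tokenizer.model')
ids: List[int] = sp_tok.encode('hello world')  # SentencePieceProcessor.Encode(..., add_bos=True)
print(sp_tok.decode(ids))

# Hugging Face backend: any other path is treated as a model directory
# and loaded with AutoTokenizer.from_pretrained.
hf_tok = Tokenizer('/data/llama/hf_model')
ids = hf_tok.encode('hello world')  # AutoTokenizer.encode(..., add_special_tokens=True)
print(hf_tok.decode(ids))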
......@@ -107,13 +107,14 @@ class Chatbot:
stop_words = None
bad_words = np.array([[[self.eos_id], [1]]], dtype=np.int32)
self.cfg = mmengine.Config(
dict(session_len=session_len,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
stop_words=stop_words,
bad_words=bad_words))
dict(
session_len=session_len,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
stop_words=stop_words,
bad_words=bad_words))
self.log_level = log_level
self.display = display
self.profile_generation = profile_generation
......@@ -200,13 +201,16 @@ class Chatbot:
return StatusCode.TRITON_SESSION_CLOSED
self._session.status = 0
for status, _, _ in self._stream_infer(self._session,
prompt='',
request_output_len=0,
sequence_start=False,
sequence_end=True):
for status, _, _ in self._stream_infer(
self._session,
prompt='',
request_output_len=0,
sequence_start=False,
sequence_end=True):
if status != StatusCode.TRITON_STREAM_END:
return status
self.reset_session()
return StatusCode.TRITON_STREAM_END
def cancel(self, session_id: int, *args, **kwargs):
......@@ -238,12 +242,13 @@ class Chatbot:
return StatusCode.TRITON_SESSION_CLOSED
prev_session = self._session
for status, res, _ in self._stream_infer(self._session,
prompt='',
request_output_len=0,
sequence_start=False,
sequence_end=False,
cancel=True):
for status, res, _ in self._stream_infer(
self._session,
prompt='',
request_output_len=0,
sequence_start=False,
sequence_end=False,
cancel=True):
if status.value < 0:
break
if status == StatusCode.TRITON_STREAM_END:
......@@ -336,11 +341,11 @@ class Chatbot:
session.response = ''
que = queue.Queue()
producer = threading.Thread(target=self._stream_producer,
args=(self.tritonserver_addr, session, que,
self.cfg, input_ids, input_lengths,
request_output_len, sequence_start,
sequence_end, preseq_length, cancel))
producer = threading.Thread(
target=self._stream_producer,
args=(self.tritonserver_addr, session, que, self.cfg, input_ids,
input_lengths, request_output_len, sequence_start,
sequence_end, preseq_length, cancel))
producer.start()
for state, res, tokens in self.stream_consumer(
self.postprocess, que, session, preseq_length, cancel, logger,
......@@ -411,12 +416,13 @@ class Chatbot:
random_seed * np.ones((1, 1), dtype=np.uint64))
]
client.start_stream(callback)
client.async_stream_infer('fastertransformer',
inputs,
sequence_id=session.session_id,
request_id=session.request_id,
sequence_start=sequence_start,
sequence_end=sequence_end)
client.async_stream_infer(
'fastertransformer',
inputs,
sequence_id=session.session_id,
request_id=session.request_id,
sequence_start=sequence_start,
sequence_end=sequence_end)
que.put(None)
@staticmethod
......
......@@ -127,28 +127,29 @@ def export(model_name: str,
vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
assert _vocab_size == vocab_size, \
f'different vocab size {_vocab_size} vs {vocab_size}'
cfg = dict(llama=dict(
model_name=model_name,
head_num=head_num,
size_per_head=size_per_head,
vocab_size=vocab_size,
num_layer=num_layer,
rotary_embedding=size_per_head,
inter_size=inter_size,
norm_eps=norm_eps,
attn_bias=attn_bias,
start_id=bos_id,
end_id=eos_id,
weight_type='fp16',
# parameters for fastertransformer
max_batch_size=32,
max_context_token_num=4,
session_len=2048,
step_length=1,
cache_max_entry_count=48,
cache_chunk_size=8,
use_context_fmha=1,
quant_policy=0))
cfg = dict(
llama=dict(
model_name=model_name,
head_num=head_num,
size_per_head=size_per_head,
vocab_size=vocab_size,
num_layer=num_layer,
rotary_embedding=size_per_head,
inter_size=inter_size,
norm_eps=norm_eps,
attn_bias=attn_bias,
start_id=bos_id,
end_id=eos_id,
weight_type='fp16',
# parameters for fastertransformer
max_batch_size=32,
max_context_token_num=4,
session_len=2048,
step_length=1,
cache_max_entry_count=48,
cache_chunk_size=8,
use_context_fmha=1,
quant_policy=0))
config = configparser.ConfigParser()
for section, key_values in cfg.items():
......@@ -166,7 +167,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
shutil.copy(tokenizer_path,
osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
else:
print('tokenizer model {tokenizer_path} does not exist')
print(f'tokenizer model {tokenizer_path} does not exist')
return False
# read model arguments from params.json
try:
......@@ -190,9 +191,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
def get_param(_name, _size):
print(_name, _size)
if _name not in model_params:
model_params[_name] = torch.zeros(_size,
dtype=torch.float16,
device='cpu')
model_params[_name] = torch.zeros(
_size, dtype=torch.float16, device='cpu')
return model_params[_name]
for i, ckpt_path in enumerate(checkpoints):
......@@ -204,7 +204,8 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
size = param_data.size(0)
if ext == 'weight':
param = get_param(
param_name, [size * n_ckpt, param_data.size(1)])
param_name,
[size * n_ckpt, param_data.size(1)])
param.data[size * i:size * (i + 1), :] = param_data
else: # bias
param = get_param(param_name, [size * n_ckpt])
......@@ -235,8 +236,9 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
# concat qkv projection
for t in ['weight', 'bias']:
for i in range(1000):
_qkv = [f'layers.{i}.attention.{k}.{t}' for k in [
'wq', 'wk', 'wv']]
_qkv = [
f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
]
try:
qkv = tuple(map(model_params.pop, _qkv))
except KeyError:
......@@ -278,8 +280,15 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
if osp.exists(tokenizer_path):
shutil.copy(tokenizer_path,
osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
for json_file in os.listdir(model_path):
if json_file.endswith(
'.json') and json_file != 'pytorch_model.bin.index.json':
json_path = osp.join(model_path, json_file)
shutil.copy(
json_path,
osp.join(triton_models_path, 'tokenizer', json_file))
else:
print('tokenizer model {tokenizer_path} does not exist')
print(f'tokenizer model {tokenizer_path} does not exist')
exit(-1)
# read model arguments from params.json
......@@ -371,19 +380,22 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
for ft, hf in other:
model_params[ft] = get_tensor(hf)
return export(model_name, num_layer, norm_eps, model_params, tokenizer_path,
triton_models_path, tp)
return export(model_name, num_layer, norm_eps, model_params,
tokenizer_path, triton_models_path, tp)
def pack_model_repository(workspace_path: str):
model_repo_dir = osp.join(workspace_path, 'model_repository')
os.makedirs(model_repo_dir, exist_ok=True)
os.symlink(src=osp.join('../triton_models/interactive'),
dst=osp.join(model_repo_dir, 'fastertransformer'))
os.symlink(src=osp.join('../triton_models/preprocessing'),
dst=osp.join(model_repo_dir, 'preprocessing'))
os.symlink(src=osp.join('../triton_models/postprocessing'),
dst=osp.join(model_repo_dir, 'postprocessing'))
os.symlink(
src=osp.join('../triton_models/interactive'),
dst=osp.join(model_repo_dir, 'fastertransformer'))
os.symlink(
src=osp.join('../triton_models/preprocessing'),
dst=osp.join(model_repo_dir, 'preprocessing'))
os.symlink(
src=osp.join('../triton_models/postprocessing'),
dst=osp.join(model_repo_dir, 'postprocessing'))
def main(model_name: str,
......
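As a standalone sketch of what deploy_hf now does for Hugging Face tokenizers: the tokenizer JSON assets are copied next to tokenizer.model so the Triton pre/post-processing models can rebuild the tokenizer later. The helper name below is illustrative, not part of the diff.

import os
import os.path as osp
import shutil

def copy_hf_tokenizer_assets(model_path: str, triton_models_path: str):
    # Copy every *.json except the weight index (pytorch_model.bin.index.json):
    # tokenizer_config.json, tokenizer.json, special_tokens_map.json, etc.
    dst_dir = osp.join(triton_models_path, 'tokenizer')
    for json_file in os.listdir(model_path):
        if json_file.endswith('.json') and \
                json_file != 'pytorch_model.bin.index.json':
            shutil.copy(osp.join(model_path, json_file), dst_dir)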
......@@ -6,22 +6,63 @@ from typing import List
import numpy as np
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor
class Tokenizer:
def __init__(self, model_file: str):
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.eos_id = self.model.eos_id()
model_folder = osp.split(model_file)[0]
tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
use_hf_model = osp.exists(tokenizer_config_file)
self.use_hf_model = use_hf_model
if not self.use_hf_model:
from sentencepiece import SentencePieceProcessor
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
else:
from transformers import AutoTokenizer
backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
if not osp.exists(backend_tokenizer_file):
print('WARNING: Cannot find tokenizer.json. '
'It may take a long time to initialize the tokenizer.')
self.model = AutoTokenizer.from_pretrained(model_folder)
self.vocab_size = self.model.vocab_size
self.start_id = self.model.bos_token_id
self.end_id = self.model.eos_token_id
# save tokenizer.json to reuse
if not osp.exists(backend_tokenizer_file):
self.model.backend_tokenizer.save(backend_tokenizer_file)
def encode(self, s: str):
return self.model.Encode(s)
if not self.use_hf_model:
add_bos = False
add_eos = False
if s.find('<BOS>') != -1:
s = s.replace('<BOS>', '')
add_bos = True
if s == '<EOS>':
s = ''
add_eos = True
return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
else:
add_special_tokens = False
if s.find('<BOS>') != -1:
s = s.replace('<BOS>', '<s>')
if s == '<EOS>':
s = '</s>'
if len(s) == 0:
add_special_tokens = True
return self.model.encode(s, add_special_tokens=add_special_tokens)
def decode(self, t: List[int]):
return self.model.Decode(t)
if not self.use_hf_model:
return self.model.Decode(t)
else:
skip_special_tokens = False
return self.model.decode(
t, skip_special_tokens=skip_special_tokens)
class TritonPythonModel:
......
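The '<BOS>'/'<EOS>' placeholder handling in the Hugging Face branch can be checked in isolation. The helper below mirrors that branch (the function name is mine, not in the diff) and runs without loading a tokenizer.

def rewrite_hf_prompt(s: str):
    # Map the placeholders to the literal special tokens and only let the
    # tokenizer add special tokens itself when the prompt is empty.
    add_special_tokens = False
    if '<BOS>' in s:
        s = s.replace('<BOS>', '<s>')
    if s == '<EOS>':
        s = '</s>'
    if len(s) == 0:
        add_special_tokens = True
    return s, add_special_tokens

assert rewrite_hf_prompt('<BOS>Hello') == ('<s>Hello', False)
assert rewrite_hf_prompt('<EOS>') == ('</s>', False)
assert rewrite_hf_prompt('') == ('', True)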
......@@ -7,31 +7,64 @@ from typing import List
import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor
from torch.nn.utils.rnn import pad_sequence
class Tokenizer:
def __init__(self, model_file: str):
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
model_folder = osp.split(model_file)[0]
tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
use_hf_model = osp.exists(tokenizer_config_file)
self.use_hf_model = use_hf_model
if not self.use_hf_model:
from sentencepiece import SentencePieceProcessor
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
else:
from transformers import AutoTokenizer
backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
if not osp.exists(backend_tokenizer_file):
print('WARNING: Cannot find tokenizer.json. '
'It may take a long time to initialize the tokenizer.')
self.model = AutoTokenizer.from_pretrained(model_folder)
self.vocab_size = self.model.vocab_size
self.start_id = self.model.bos_token_id
self.end_id = self.model.eos_token_id
# save tokenizer.json to reuse
if not osp.exists(backend_tokenizer_file):
self.model.backend_tokenizer.save(backend_tokenizer_file)
def encode(self, s: str):
add_bos = False
add_eos = False
if s.find('<BOS>') != -1:
s = s.replace('<BOS>', '')
add_bos = True
if s == '<EOS>':
s = ''
add_eos = True
return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
if not self.use_hf_model:
add_bos = False
add_eos = False
if s.find('<BOS>') != -1:
s = s.replace('<BOS>', '')
add_bos = True
if s == '<EOS>':
s = ''
add_eos = True
return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
else:
add_special_tokens = False
if s.find('<BOS>') != -1:
s = s.replace('<BOS>', '<s>')
if s == '<EOS>':
s = '</s>'
if len(s) == 0:
add_special_tokens = True
return self.model.encode(s, add_special_tokens=add_special_tokens)
def decode(self, t: List[int]):
return self.model.Decode(t)
if not self.use_hf_model:
return self.model.Decode(t)
else:
skip_special_tokens = False
return self.model.decode(
t, skip_special_tokens=skip_special_tokens)
class TritonPythonModel:
......@@ -157,7 +190,6 @@ class TritonPythonModel:
for s in query
]
start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
start_ids = pad_sequence(start_ids,
batch_first=True,
padding_value=self.end_id)
start_ids = pad_sequence(
start_ids, batch_first=True, padding_value=self.end_id)
return start_ids, start_lengths
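The reformatted pad_sequence call at the end of the preprocessing model can be exercised on its own; the token ids and end_id below are made up for illustration.

import torch
from torch.nn.utils.rnn import pad_sequence

end_id = 2  # hypothetical eos id used as padding
start_ids = [torch.IntTensor([1, 5, 7, 9]), torch.IntTensor([1, 4])]
start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])

# Right-pad every prompt to the longest one with end_id, producing the
# [batch, max_len] int32 tensor handed to the FasterTransformer backend.
start_ids = pad_sequence(start_ids, batch_first=True, padding_value=end_id)
# start_ids      -> [[1, 5, 7, 9], [1, 4, 2, 2]]
# start_lengths  -> [[4], [2]]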