Unverified Commit 4f47f78c authored by lvhan028, committed by GitHub

check-in fastertransformer's triton models (#3)

parent ef2adb04
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "weights"
max_batch_size: 1
model_transaction_policy {
  decoupled: True
}
instance_group [
  {
    # max concurrent instances
    count: 48
    kind: KIND_CPU
  }
]
input [
  {
    name: "input_ids"
    data_type: TYPE_UINT32
    dims: [ -1 ]
    # allow_ragged_batch: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "step"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "session_len"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "start_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "prompt_learning_task_name_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_reset_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "START"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "END"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "STOP"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "CORRID"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_UINT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  }
]
parameters {
  key: "tensor_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "pipeline_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "data_type"
  value: {
    string_value: "fp16"
  }
}
parameters {
  key: "model_type"
  value: {
    string_value: "Llama"
  }
}
parameters {
  key: "enable_custom_all_reduce"
  value: {
    string_value: "0"
  }
}
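The config above declares a decoupled model (`model_transaction_policy { decoupled: True }`), so responses stream back as they are generated and a client must use the gRPC streaming API. Below is a minimal client sketch, assuming a Triton server listening on localhost:8001 and the `tritonclient[grpc]` package; the token ids and the output length of 512 are placeholders, not real vocabulary entries.

# Minimal streaming-client sketch for the decoupled "fastertransformer"
# model above. Assumes a server at localhost:8001; token ids are placeholders.
import numpy as np
import tritonclient.grpc as grpcclient


def on_response(result, error):
    # Invoked once per streamed response from the decoupled model.
    if error is not None:
        print(error)
    else:
        print(result.as_numpy('output_ids'))


client = grpcclient.InferenceServerClient('localhost:8001')
input_ids = np.array([[1, 2, 3]], dtype=np.uint32)  # [batch, seq_len], batch == 1
inputs = [
    grpcclient.InferInput('input_ids', list(input_ids.shape), 'UINT32'),
    grpcclient.InferInput('input_lengths', [1, 1], 'UINT32'),
    grpcclient.InferInput('request_output_len', [1, 1], 'UINT32'),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(np.array([[input_ids.shape[1]]], dtype=np.uint32))
inputs[2].set_data_from_numpy(np.array([[512]], dtype=np.uint32))

client.start_stream(callback=on_response)
client.async_stream_infer('fastertransformer', inputs)
client.stop_stream()  # drain in-flight responses before exiting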
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.eos_id = self.model.eos_id()

    def encode(self, s: str):
        return self.model.Encode(s)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` is optional. It allows the model to
        initialize any state associated with it.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing the model instance kind
          * model_instance_device_id: A string containing the model instance
            device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, 'OUTPUT')

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(
                cur_folder, self.model_config['parameters']['tokenizer_path']
                ['string_value']))

    def execute(self, requests):
        """`execute` must be implemented in every Python model. It receives a
        list of pb_utils.InferenceRequest as its only argument and is called
        when inference is requested for this model. Depending on the batching
        configuration (e.g. dynamic batching), `requests` may contain multiple
        requests. Every Python model must create one
        pb_utils.InferenceResponse for every pb_utils.InferenceRequest in
        `requests`. If there is an error, you can set the error argument when
        creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get input tensors
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()
            sequence_length = pb_utils.get_input_tensor_by_name(
                request, 'sequence_length').as_numpy()

            # Postprocess the output data.
            outputs = self._postprocessing(tokens_batch.tolist(),
                                           sequence_length)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=...,
            #     error=pb_utils.TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Its length
        # must match the length of `requests`.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` is optional. It allows the model to perform
        any necessary clean-up before exit.
        """
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_length):
        # Decode each beam's first `_len` tokens into a UTF-8 byte string.
        outputs = []
        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
            for tokens, _len in zip(beam_tokens, beam_len):
                output = self.tokenizer.decode(tokens[:_len])
                output = output.encode('utf8')
                outputs.append(output)
        return outputs
name: "postprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "TOKENS_BATCH"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}
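The postprocessing model can be sanity-checked in isolation. The sketch below sends a single-beam token batch over HTTP; it is a minimal sketch, assuming a running Triton server at localhost:8000 and the `tritonclient[http]` package, and the token ids are placeholders for real SentencePiece vocabulary ids.

# Minimal sketch: call the "postprocessing" model directly over HTTP.
# Assumes a server at localhost:8000; token ids are placeholders.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient('localhost:8000')
tokens = np.array([[[5, 6, 7]]], dtype=np.uint32)  # [batch, beam, seq_len]
seq_len = np.array([[3]], dtype=np.uint32)         # [batch, beam]
inputs = [
    httpclient.InferInput('TOKENS_BATCH', list(tokens.shape), 'UINT32'),
    httpclient.InferInput('sequence_length', list(seq_len.shape), 'UINT32'),
]
inputs[0].set_data_from_numpy(tokens)
inputs[1].set_data_from_numpy(seq_len)
result = client.infer('postprocessing', inputs)
print(result.as_numpy('OUTPUT'))  # decoded strings as UTF-8 bytes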
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor
from torch.nn.utils.rnn import pad_sequence


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.end_id = self.model.eos_id()

    def encode(self, s: str):
        # A '<BOS>' marker in the query asks for the bos token to be
        # prepended to the encoded ids.
        add_bos = False
        if s.find('<BOS>') != -1:
            s = s.replace('<BOS>', '')
            add_bos = True
        return self.model.Encode(s, add_bos=add_bos)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` is optional. It allows the model to
        initialize any state associated with it.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing the model instance kind
          * model_instance_device_id: A string containing the model instance
            device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse this model's output configs (INPUT_ID and REQUEST_INPUT_LEN
        # are outputs of the preprocessing model) and convert Triton types
        # to numpy types
        output_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
        for output_name in output_names:
            setattr(
                self,
                output_name.lower() + '_dtype',
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, output_name)['data_type']))

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(
                cur_folder, self.model_config['parameters']['tokenizer_path']
                ['string_value']))
        self.start_id = self.tokenizer.start_id
        self.end_id = self.tokenizer.end_id

    def execute(self, requests):
        """`execute` must be implemented in every Python model. It receives a
        list of pb_utils.InferenceRequest as its only argument and is called
        when inference is requested for this model. Depending on the batching
        configuration (e.g. dynamic batching), `requests` may contain multiple
        requests. Every Python model must create one
        pb_utils.InferenceResponse for every pb_utils.InferenceRequest in
        `requests`. If there is an error, you can set the error argument when
        creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()

            # Preprocess the input data.
            input_id, request_input_len = self._create_request(query)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID',
                np.array(input_id).astype(self.input_id_dtype))
            request_input_len_tensor = pb_utils.Tensor(
                'REQUEST_INPUT_LEN',
                np.array(request_input_len).astype(
                    self.request_input_len_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=...,
            #     error=pb_utils.TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[input_id_tensor, request_input_len_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Its length
        # must match the length of `requests`.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` is optional. It allows the model to perform
        any necessary clean-up before exit.
        """
        print('Cleaning up...')

    def _create_request(self, query):
        # Tokenize each query string; a '<BOS>' marker adds the bos token.
        start_ids = [
            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
        # Right-pad the batch to a rectangle with the end-of-sequence id.
        start_ids = pad_sequence(start_ids,
                                 batch_first=True,
                                 padding_value=self.end_id)
        return start_ids, start_lengths
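`_create_request` right-pads the per-query token lists with `end_id` so they form one rectangular batch, while `start_lengths` preserves each query's true length. A minimal local sketch of that padding behavior, with made-up token ids and an assumed `end_id` of 2:

# Local sketch of the padding done by _create_request above.
# Token ids and end_id (2) are made up for illustration.
import torch
from torch.nn.utils.rnn import pad_sequence

ids = [torch.IntTensor([5, 6, 7]), torch.IntTensor([8, 9])]
lengths = torch.IntTensor([[len(t)] for t in ids])  # [[3], [2]]
padded = pad_sequence(ids, batch_first=True, padding_value=2)
print(padded)  # tensor([[5, 6, 7], [8, 9, 2]], dtype=torch.int32)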
name: "preprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "QUERY"
data_type: TYPE_STRING
dims: [ -1 ]
}
]
output [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}
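As with postprocessing, the preprocessing model can be exercised on its own. A minimal HTTP sketch, assuming a server at localhost:8000; the `<BOS>` prefix exercises the add_bos branch of `Tokenizer.encode` above, and `REQUEST_INPUT_LEN` assumes the output declared in the config above.

# Minimal sketch: call the "preprocessing" model over HTTP.
# Assumes a server at localhost:8000.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient('localhost:8000')
query = np.array([['<BOS>Hello, world'.encode()]], dtype=object)  # [batch, 1]
inp = httpclient.InferInput('QUERY', list(query.shape), 'BYTES')
inp.set_data_from_numpy(query)
result = client.infer('preprocessing', [inp])
print(result.as_numpy('INPUT_ID'))           # padded token ids (uint32)
print(result.as_numpy('REQUEST_INPUT_LEN'))  # true lengths before padding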