OpenDAS / Lmdeploy · Commits · 4f47f78c

Unverified commit 4f47f78c, authored Jun 18, 2023 by lvhan028, committed via GitHub on Jun 18, 2023
check-in fastertransformer's triton models (#3)

parent ef2adb04
Showing 10 changed files with 630 additions and 0 deletions (+630 -0)
llmdeploy/serve/fastertransformer/triton_models/interactive/1/weights          +1    -0
llmdeploy/serve/fastertransformer/triton_models/interactive/config.pbtxt       +274  -0
llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/model.py      +139  -0
llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/tokenizer     +1    -0
llmdeploy/serve/fastertransformer/triton_models/postprocessing/config.pbtxt    +36   -0
llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/model.py       +147  -0
llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/tokenizer      +1    -0
llmdeploy/serve/fastertransformer/triton_models/preprocessing/config.pbtxt     +31   -0
llmdeploy/serve/fastertransformer/triton_models/tokenizer/placeholder          +0    -0
llmdeploy/serve/fastertransformer/triton_models/weights/config.ini             +0    -0

llmdeploy/serve/fastertransformer/triton_models/interactive/1/weights (new symlink, mode 120000)

../../weights
\ No newline at end of file

llmdeploy/serve/fastertransformer/triton_models/interactive/config.pbtxt (new file, mode 100644)

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "weights"
max_batch_size: 1
model_transaction_policy {
decoupled: True
}
instance_group [
{
# max concurrent instances
count: 48
kind: KIND_CPU
}
]
input [
{
name: "input_ids"
data_type: TYPE_UINT32
dims: [ -1 ]
# allow_ragged_batch: true
},
{
name: "input_lengths"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "step"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "session_len"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_k"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "random_seed"
data_type: TYPE_UINT64
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_width"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "start_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "end_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
name: "prompt_learning_task_name_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_decay"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_min"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_reset_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "START"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "END"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "STOP"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "CORRID"
data_type: TYPE_UINT64
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "cum_log_probs"
data_type: TYPE_FP32
dims: [ -1 ]
},
{
name: "output_log_probs"
data_type: TYPE_FP32
dims: [ -1, -1 ]
}
]
parameters {
key: "tensor_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "pipeline_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "data_type"
value: {
string_value: "fp16"
}
}
parameters {
key: "model_type"
value: {
string_value: "Llama"
}
}
parameters {
key: "enable_custom_all_reduce"
value: {
string_value: "0"
}
}
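
Since this model is decoupled (model_transaction_policy { decoupled: True }), it has to be driven through Triton's streaming API rather than a plain blocking request. Below is a minimal, hypothetical client sketch that is not part of this commit; the server address, placeholder token IDs, and request_output_len value are assumptions. Only the three required inputs are populated, and the optional sampling inputs declared above can be attached the same way.

# Hypothetical client sketch -- not shipped in this commit.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def prepare_tensor(name, array):
    # Wrap a numpy array as a Triton InferInput with matching shape and dtype.
    tensor = grpcclient.InferInput(name, list(array.shape),
                                   np_to_triton_dtype(array.dtype))
    tensor.set_data_from_numpy(array)
    return tensor


input_ids = np.array([[1, 3087, 1921]], dtype=np.uint32)  # placeholder token IDs
inputs = [
    prepare_tensor('input_ids', input_ids),
    prepare_tensor('input_lengths',
                   np.array([[input_ids.shape[1]]], dtype=np.uint32)),
    prepare_tensor('request_output_len', np.array([[512]], dtype=np.uint32)),
]


def on_response(result, error):
    # Each streamed response carries a chunk of output_ids / sequence_length.
    if error is not None:
        print(error)
    else:
        print(result.as_numpy('output_ids'))


client = grpcclient.InferenceServerClient('localhost:8001')
client.start_stream(callback=on_response)
client.async_stream_infer('fastertransformer', inputs)
client.stop_stream()  # close the stream (a real client would wait for the final response first)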

llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/model.py (new file, mode 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.eos_id = self.model.eos_id()

    def encode(self, s: str):
        return self.model.Encode(s)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device
            ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, 'OUTPUT')

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(cur_folder, self.model_config['parameters']
                     ['tokenizer_path']['string_value']))

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for idx, request in enumerate(requests):
            # Get input tensors
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()
            sequence_length = pb_utils.get_input_tensor_by_name(
                request, 'sequence_length').as_numpy()

            # Postprocessing output data.
            outputs = self._postprocessing(tokens_batch.tolist(),
                                           sequence_length)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` function is optional. This function allows the
        model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_length):
        outputs = []
        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
            for tokens, _len in zip(beam_tokens, beam_len):
                output = self.tokenizer.decode(tokens[:_len])
                output = output.encode('utf8')
                outputs.append(output)
        return outputs
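
For reference, a standalone call to this postprocessing model might look like the sketch below. It is not part of this commit; the token IDs and server address are placeholders, and the input names and dtypes follow the postprocessing config.pbtxt shown further down.

# Hypothetical client sketch -- not shipped in this commit.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

# Placeholder token IDs: one batch, one beam, four tokens.
tokens_batch = np.array([[[1, 3087, 1921, 2]]], dtype=np.uint32)
sequence_length = np.array([[tokens_batch.shape[-1]]], dtype=np.uint32)

inputs = []
for name, array in (('TOKENS_BATCH', tokens_batch),
                    ('sequence_length', sequence_length)):
    tensor = grpcclient.InferInput(name, list(array.shape),
                                   np_to_triton_dtype(array.dtype))
    tensor.set_data_from_numpy(array)
    inputs.append(tensor)

client = grpcclient.InferenceServerClient('localhost:8001')
result = client.infer('postprocessing', inputs)
print(result.as_numpy('OUTPUT'))  # decoded text as byte strings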

llmdeploy/serve/fastertransformer/triton_models/postprocessing/1/tokenizer (new symlink, mode 120000)

../../tokenizer/
\ No newline at end of file

llmdeploy/serve/fastertransformer/triton_models/postprocessing/config.pbtxt (new file, mode 100644)

name: "postprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "TOKENS_BATCH"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}

llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/model.py (new file, mode 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor
from torch.nn.utils.rnn import pad_sequence


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.end_id = self.model.eos_id()

    def encode(self, s: str):
        add_bos = False
        if s.find('<BOS>') != -1:
            s = s.replace('<BOS>', '')
            add_bos = True
        return self.model.Encode(s, add_bos=add_bos)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device
            ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse model output configs and convert Triton types to numpy types
        input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
        for input_name in input_names:
            setattr(
                self,
                input_name.lower() + '_dtype',
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, input_name)['data_type']))

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(cur_folder, self.model_config['parameters']
                     ['tokenizer_path']['string_value']))
        self.start_id = self.tokenizer.start_id
        self.end_id = self.tokenizer.end_id

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for idx, request in enumerate(requests):
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()

            # Preprocessing input data.
            input_id, request_input_len = self._create_request(query)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID',
                np.array(input_id).astype(self.input_id_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[input_id_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` function is optional. This function allows the
        model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _create_request(self, query):
        start_ids = [
            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
        start_ids = pad_sequence(start_ids,
                                 batch_first=True,
                                 padding_value=self.end_id)
        return start_ids, start_lengths
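
Analogously, the preprocessing model can be exercised on its own. The sketch below is not shipped with this commit; the prompt and server address are placeholders. Triton TYPE_STRING inputs are passed as numpy object arrays, and the '<BOS>' marker is recognized by the Tokenizer.encode method above.

# Hypothetical client sketch -- not shipped in this commit.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

# Placeholder prompt; '<BOS>' asks the tokenizer above to prepend the BOS token.
query = np.array([['<BOS>Hello, world']], dtype=np.object_)

tensor = grpcclient.InferInput('QUERY', list(query.shape),
                               np_to_triton_dtype(query.dtype))
tensor.set_data_from_numpy(query)

client = grpcclient.InferenceServerClient('localhost:8001')
result = client.infer('preprocessing', [tensor])
print(result.as_numpy('INPUT_ID'))  # uint32 token IDs from the SentencePiece tokenizer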

llmdeploy/serve/fastertransformer/triton_models/preprocessing/1/tokenizer (new symlink, mode 120000)

../../tokenizer
\ No newline at end of file

llmdeploy/serve/fastertransformer/triton_models/preprocessing/config.pbtxt (new file, mode 100644)

name: "preprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "QUERY"
data_type: TYPE_STRING
dims: [ -1 ]
}
]
output [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}

llmdeploy/serve/fastertransformer/triton_models/tokenizer/placeholder (new empty file, mode 100644)

llmdeploy/serve/fastertransformer/triton_models/weights/config.ini (new empty file, mode 100644)