Unverified Commit 4f47f78c authored by lvhan028, committed by GitHub

check-in fastertransformer's triton models (#3)

parent ef2adb04
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "weights"
max_batch_size: 1
model_transaction_policy {
  decoupled: True
}
instance_group [
  {
    # max concurrent instances
    count: 48
    kind: KIND_CPU
  }
]
input [
  {
    name: "input_ids"
    data_type: TYPE_UINT32
    dims: [ -1 ]
    # allow_ragged_batch: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "step"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "session_len"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "start_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "prompt_learning_task_name_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_reset_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "START"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "END"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "STOP"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "CORRID"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_UINT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  }
]
parameters {
  key: "tensor_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "pipeline_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "data_type"
  value: {
    string_value: "fp16"
  }
}
parameters {
  key: "model_type"
  value: {
    string_value: "Llama"
  }
}
parameters {
  key: "enable_custom_all_reduce"
  value: {
    string_value: "0"
  }
}
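The config above declares a decoupled model (`model_transaction_policy { decoupled: True }`), so responses stream back as they are generated and a client must use the gRPC streaming API. Below is a minimal client sketch, assuming a Triton server listening on localhost:8001 and the `tritonclient[grpc]` package; the token ids and the output length of 512 are placeholders, not real vocabulary entries.

# Minimal streaming-client sketch for the decoupled "fastertransformer"
# model above. Assumes a server at localhost:8001; token ids are placeholders.
import numpy as np
import tritonclient.grpc as grpcclient


def on_response(result, error):
    # Invoked once per streamed response from the decoupled model.
    if error is not None:
        print(error)
    else:
        print(result.as_numpy('output_ids'))


client = grpcclient.InferenceServerClient('localhost:8001')
input_ids = np.array([[1, 2, 3]], dtype=np.uint32)  # [batch, seq_len], batch == 1
inputs = [
    grpcclient.InferInput('input_ids', list(input_ids.shape), 'UINT32'),
    grpcclient.InferInput('input_lengths', [1, 1], 'UINT32'),
    grpcclient.InferInput('request_output_len', [1, 1], 'UINT32'),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(np.array([[input_ids.shape[1]]], dtype=np.uint32))
inputs[2].set_data_from_numpy(np.array([[512]], dtype=np.uint32))

client.start_stream(callback=on_response)
client.async_stream_infer('fastertransformer', inputs)
client.stop_stream()  # drain in-flight responses before exiting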
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.eos_id = self.model.eos_id()

    def encode(self, s: str):
        return self.model.Encode(s)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` is optional. It allows the model to
        initialize any state associated with it.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing the model instance kind
          * model_instance_device_id: A string containing the model instance
            device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, 'OUTPUT')

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(
                cur_folder, self.model_config['parameters']['tokenizer_path']
                ['string_value']))

    def execute(self, requests):
        """`execute` must be implemented in every Python model. It receives a
        list of pb_utils.InferenceRequest as its only argument and is called
        when inference is requested for this model. Depending on the batching
        configuration (e.g. dynamic batching), `requests` may contain multiple
        requests. Every Python model must create one
        pb_utils.InferenceResponse for every pb_utils.InferenceRequest in
        `requests`. If there is an error, you can set the error argument when
        creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get input tensors
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()
            sequence_length = pb_utils.get_input_tensor_by_name(
                request, 'sequence_length').as_numpy()

            # Postprocess the output data.
            outputs = self._postprocessing(tokens_batch.tolist(),
                                           sequence_length)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=...,
            #     error=pb_utils.TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Its length
        # must match the length of `requests`.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` is optional. It allows the model to perform
        any necessary clean-up before exit.
        """
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_length):
        # Decode each beam's first `_len` tokens into a UTF-8 byte string.
        outputs = []
        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
            for tokens, _len in zip(beam_tokens, beam_len):
                output = self.tokenizer.decode(tokens[:_len])
                output = output.encode('utf8')
                outputs.append(output)
        return outputs
name: "postprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "TOKENS_BATCH"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}
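The postprocessing model can be sanity-checked in isolation. The sketch below sends a single-beam token batch over HTTP; it is a minimal sketch, assuming a running Triton server at localhost:8000 and the `tritonclient[http]` package, and the token ids are placeholders for real SentencePiece vocabulary ids.

# Minimal sketch: call the "postprocessing" model directly over HTTP.
# Assumes a server at localhost:8000; token ids are placeholders.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient('localhost:8000')
tokens = np.array([[[5, 6, 7]]], dtype=np.uint32)  # [batch, beam, seq_len]
seq_len = np.array([[3]], dtype=np.uint32)         # [batch, beam]
inputs = [
    httpclient.InferInput('TOKENS_BATCH', list(tokens.shape), 'UINT32'),
    httpclient.InferInput('sequence_length', list(seq_len.shape), 'UINT32'),
]
inputs[0].set_data_from_numpy(tokens)
inputs[1].set_data_from_numpy(seq_len)
result = client.infer('postprocessing', inputs)
print(result.as_numpy('OUTPUT'))  # decoded strings as UTF-8 bytes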
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
from typing import List

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from sentencepiece import SentencePieceProcessor
from torch.nn.utils.rnn import pad_sequence


class Tokenizer:

    def __init__(self, model_file: str):
        self.model = SentencePieceProcessor(model_file=model_file)
        self.vocab_size = self.model.vocab_size()
        self.start_id = self.model.bos_id()
        self.end_id = self.model.eos_id()

    def encode(self, s: str):
        # A '<BOS>' marker in the query asks for the bos token to be
        # prepended to the encoded ids.
        add_bos = False
        if s.find('<BOS>') != -1:
            s = s.replace('<BOS>', '')
            add_bos = True
        return self.model.Encode(s, add_bos=add_bos)

    def decode(self, t: List[int]):
        return self.model.Decode(t)


class TritonPythonModel:
    """Your Python model must use the same class name.

    Every Python model that is created must have "TritonPythonModel" as the
    class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Implementing `initialize` is optional. It allows the model to
        initialize any state associated with it.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing the model instance kind
          * model_instance_device_id: A string containing the model instance
            device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        self.model_config = model_config = json.loads(args['model_config'])

        # Parse this model's output configs (INPUT_ID and REQUEST_INPUT_LEN
        # are outputs of the preprocessing model) and convert Triton types
        # to numpy types
        output_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
        for output_name in output_names:
            setattr(
                self,
                output_name.lower() + '_dtype',
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, output_name)['data_type']))

        cur_folder = Path(__file__).parent
        self.tokenizer = Tokenizer(
            osp.join(
                cur_folder, self.model_config['parameters']['tokenizer_path']
                ['string_value']))
        self.start_id = self.tokenizer.start_id
        self.end_id = self.tokenizer.end_id

    def execute(self, requests):
        """`execute` must be implemented in every Python model. It receives a
        list of pb_utils.InferenceRequest as its only argument and is called
        when inference is requested for this model. Depending on the batching
        configuration (e.g. dynamic batching), `requests` may contain multiple
        requests. Every Python model must create one
        pb_utils.InferenceResponse for every pb_utils.InferenceRequest in
        `requests`. If there is an error, you can set the error argument when
        creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()

            # Preprocess the input data.
            input_id, request_input_len = self._create_request(query)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID',
                np.array(input_id).astype(self.input_id_dtype))
            request_input_len_tensor = pb_utils.Tensor(
                'REQUEST_INPUT_LEN',
                np.array(request_input_len).astype(
                    self.request_input_len_dtype))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=...,
            #     error=pb_utils.TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[input_id_tensor, request_input_len_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Its length
        # must match the length of `requests`.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.

        Implementing `finalize` is optional. It allows the model to perform
        any necessary clean-up before exit.
        """
        print('Cleaning up...')

    def _create_request(self, query):
        # Tokenize each query string; a '<BOS>' marker adds the bos token.
        start_ids = [
            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
        # Right-pad the batch to a rectangle with the end-of-sequence id.
        start_ids = pad_sequence(start_ids,
                                 batch_first=True,
                                 padding_value=self.end_id)
        return start_ids, start_lengths
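`_create_request` right-pads the per-query token lists with `end_id` so they form one rectangular batch, while `start_lengths` preserves each query's true length. A minimal local sketch of that padding behavior, with made-up token ids and an assumed `end_id` of 2:

# Local sketch of the padding done by _create_request above.
# Token ids and end_id (2) are made up for illustration.
import torch
from torch.nn.utils.rnn import pad_sequence

ids = [torch.IntTensor([5, 6, 7]), torch.IntTensor([8, 9])]
lengths = torch.IntTensor([[len(t)] for t in ids])  # [[3], [2]]
padded = pad_sequence(ids, batch_first=True, padding_value=2)
print(padded)  # tensor([[5, 6, 7], [8, 9, 2]], dtype=torch.int32)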
name: "preprocessing"
backend: "python"
max_batch_size: 1
input [
{
name: "QUERY"
data_type: TYPE_STRING
dims: [ -1 ]
}
]
output [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
parameters {
key: "tokenizer_path"
value: {
string_value: "tokenizer/tokenizer.model"
}
}
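As with postprocessing, the preprocessing model can be exercised on its own. A minimal HTTP sketch, assuming a server at localhost:8000; the `<BOS>` prefix exercises the add_bos branch of `Tokenizer.encode` above, and `REQUEST_INPUT_LEN` assumes the output declared in the config above.

# Minimal sketch: call the "preprocessing" model over HTTP.
# Assumes a server at localhost:8000.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient('localhost:8000')
query = np.array([['<BOS>Hello, world'.encode()]], dtype=object)  # [batch, 1]
inp = httpclient.InferInput('QUERY', list(query.shape), 'BYTES')
inp.set_data_from_numpy(query)
result = client.infer('preprocessing', [inp])
print(result.as_numpy('INPUT_ID'))           # padded token ids (uint32)
print(result.as_numpy('REQUEST_INPUT_LEN'))  # true lengths before padding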