Commit 0bfd9a76 authored by Neelay Shah, committed by GitHub

refactor: remove python native runtime

parent 8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer


class TritonPythonModel:
    """
    This model allows Triton to act as an API server for T3 ICP.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        inputs = [
            {"name": "tokens_batch", "data_type": "TYPE_INT32", "dims": [-1, -1]},
            {"name": "sequence_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
        ]
        outputs = [
            {"name": "output", "data_type": "TYPE_STRING", "dims": [-1]},
        ]

        # Store the model configuration as a dictionary.
        config = auto_complete_model_config.as_dict()
        input_names = [inp["name"] for inp in config["input"]]
        output_names = [out["name"] for out in config["output"]]

        # Add only the missing inputs and outputs to the model configuration.
        for inp in inputs:
            if inp["name"] not in input_names:
                auto_complete_model_config.add_input(inp)
        for out in outputs:
            if out["name"] not in output_names:
                auto_complete_model_config.add_output(out)
        return auto_complete_model_config
    def initialize(self, args):
        self.logger = pb_utils.Logger

        # Parse the model configuration.
        model_config = json.loads(args["model_config"])
        tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]

        # 'skip_special_tokens' is an optional boolean-like string parameter;
        # it defaults to True when absent or unrecognized.
        skip_special_tokens = model_config["parameters"].get("skip_special_tokens")
        if skip_special_tokens is not None:
            skip_special_tokens_str = skip_special_tokens["string_value"].lower()
            if skip_special_tokens_str in ["true", "false", "1", "0", "t", "f", "y", "n", "yes", "no"]:
                self.skip_special_tokens = skip_special_tokens_str in ["true", "1", "t", "y", "yes"]
            else:
                self.logger.log_warn(
                    f"[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set correctly "
                    f"(value is {skip_special_tokens['string_value']}); defaulting to True."
                )
                self.skip_special_tokens = True
        else:
            self.logger.log_warn(
                "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set; defaulting to True."
            )
            self.skip_special_tokens = True

        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
        )
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Resolve the numpy dtype of each output from the model configuration.
        for output_name in ["output"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)["data_type"]
                ),
            )
    def execute(self, requests):
        tokens_batch = []
        sequence_lengths = []
        for request in requests:
            for input_tensor in request.inputs():
                if input_tensor.name() == "tokens_batch":
                    tokens_batch.append(input_tensor.as_numpy())
                elif input_tensor.name() == "sequence_lengths":
                    sequence_lengths.append(input_tensor.as_numpy())
                else:
                    raise ValueError(f"unknown input {input_tensor.name()}")

        # Flatten every (request, batch, beam) sequence into one list so the
        # tokenizer can decode everything in a single batched call, recording
        # per-request offsets to slice the decoded strings back out.
        list_of_tokens = []
        req_idx_offset = 0
        req_idx_offsets = [req_idx_offset]
        for idx, token_batch in enumerate(tokens_batch):
            for batch_idx, beam_tokens in enumerate(token_batch):
                for beam_idx, tokens in enumerate(beam_tokens):
                    seq_len = sequence_lengths[idx][batch_idx][beam_idx]
                    list_of_tokens.append(tokens[:seq_len])
                    req_idx_offset += 1
            req_idx_offsets.append(req_idx_offset)
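        # Worked example (illustrative): two requests, the first carrying two
        # beam sequences and the second one, yield req_idx_offsets == [0, 2, 3];
        # request 0 then owns all_outputs[0:2] and request 1 owns all_outputs[2:3].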
        all_outputs = self.tokenizer.batch_decode(
            list_of_tokens, skip_special_tokens=self.skip_special_tokens
        )

        # Construct one response per request.
        responses = []
        for idx, request in enumerate(requests):
            req_outputs = [
                x.encode("utf8")
                for x in all_outputs[req_idx_offsets[idx] : req_idx_offsets[idx + 1]]
            ]
            output_tensor = pb_utils.Tensor(
                "output", np.array(req_outputs).astype(self.output_dtype)
            )
            outputs = [output_tensor]
            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
            responses.append(inference_response)
        return responses

    def finalize(self):
        """`finalize` is called only once, when the model is being unloaded.
        Implementing `finalize` is optional; it allows the model to perform
        any necessary clean-up before exit.
        """
        print("Cleaning up...")
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 64
dynamic_batching {}
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
#parameters {
# key: "skip_special_tokens"
# value: {
# string_value: "${skip_special_tokens}"
# }
#}
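# To enable the optional parameter above, uncomment the block and replace the
# template variable with a literal value the Python model accepts
# ("true"/"false", "1"/"0", "t"/"f", "y"/"n", "yes"/"no"), e.g.:
#
#   parameters {
#     key: "skip_special_tokens"
#     value: { string_value: "true" }
#   }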

instance_group [
  {
    count: 10
    kind: KIND_CPU
  }
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

import numpy
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer


class TritonPythonModel:
    """
    This model allows Triton to act as an API server for T3 ICP.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        inputs = [
            {"name": "query", "data_type": "TYPE_STRING", "dims": [1]},
        ]
        outputs = [
            {"name": "start_ids", "data_type": "TYPE_INT32", "dims": [-1]},
            {"name": "start_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
        ]

        # Store the model configuration as a dictionary.
        config = auto_complete_model_config.as_dict()
        input_names = [inp["name"] for inp in config["input"]]
        output_names = [out["name"] for out in config["output"]]

        # Add only the missing inputs and outputs to the model configuration.
        for inp in inputs:
            if inp["name"] not in input_names:
                auto_complete_model_config.add_input(inp)
        for out in outputs:
            if out["name"] not in output_names:
                auto_complete_model_config.add_output(out)
        return auto_complete_model_config
    def initialize(self, args):
        self.logger = pb_utils.Logger

        # Parse the model configuration.
        model_config = json.loads(args["model_config"])
        tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]

        self._tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
        )
        if isinstance(self._tokenizer, T5Tokenizer):
            self._tokenizer_bos_id = self._tokenizer.sp_model.bos_id()
        if not self._tokenizer.pad_token:
            self._tokenizer.pad_token = self._tokenizer.eos_token

        self._tokenizer_end_id = self._tokenizer.encode(
            self._tokenizer.eos_token, add_special_tokens=False
        )[0]
        self._tokenizer_pad_id = self._tokenizer.encode(
            self._tokenizer.pad_token, add_special_tokens=False
        )[0]
        self._vocab_size = self._tokenizer.vocab_size

        # Resolve the numpy dtype of each output from the model configuration.
        for output_name in ["start_ids", "start_lengths"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)["data_type"]
                ),
            )
    def execute(self, requests):
        responses = []
        for request in requests:
            query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()

            # Tokenize the input strings; T5 tokenizers get the BOS id
            # prepended explicitly.
            if isinstance(self._tokenizer, T5Tokenizer):
                start_ids = [
                    numpy.array(
                        [self._tokenizer_bos_id]
                        + self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
                    ).astype(numpy.int32)
                    for s in query
                ]
            else:
                start_ids = [
                    numpy.array(
                        self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
                    ).astype(numpy.int32)
                    for s in query
                ]
            start_lengths = numpy.array([[len(ids)] for ids in start_ids]).astype(
                numpy.int32
            )

            # Right-pad every sequence to the longest in the batch with the
            # tokenizer's pad id.
            max_len = max(seq.shape[0] for seq in start_ids)
            start_ids = numpy.stack(
                [
                    numpy.pad(
                        seq,
                        (0, max_len - seq.shape[0]),
                        "constant",
                        constant_values=(0, self._tokenizer_pad_id),
                    )
                    for seq in start_ids
                ]
            )
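            # Worked example (illustrative): sequences of lengths 3 and 5
            # produce start_lengths == [[3], [5]] and a padded start_ids of
            # shape (2, 5), the first row ending in two pad ids.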
            start_ids_tensor = pb_utils.Tensor(
                "start_ids", numpy.array(start_ids).astype(self.start_ids_dtype)
            )
            start_lengths_tensor = pb_utils.Tensor(
                "start_lengths",
                numpy.array(start_lengths).astype(self.start_lengths_dtype),
            )
            outputs = [start_ids_tensor, start_lengths_tensor]
            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
            responses.append(inference_response)
        return responses
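

# --- Usage sketch (illustrative; names are assumptions) ---
# A minimal client-side call to this tokenizer through Triton, assuming
# tritonclient is installed, the server listens on localhost:8000, and the
# model is deployed under the hypothetical name "preprocessing".
if __name__ == "__main__":
    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient(url="localhost:8000")
    query = numpy.array([["What is machine learning?"]], dtype=object)  # [batch, 1]
    query_input = httpclient.InferInput("query", list(query.shape), "BYTES")
    query_input.set_data_from_numpy(query)
    result = client.infer("preprocessing", inputs=[query_input])
    print(result.as_numpy("start_ids"))      # padded token ids, [batch, max_len]
    print(result.as_numpy("start_lengths"))  # true lengths, [batch, 1]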