Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0bfd9a76
Commit
0bfd9a76
authored
Feb 24, 2025
by
Neelay Shah
Committed by
GitHub
Feb 24, 2025
Browse files
refactor: remove python native runtime
parent
8f741f14
Changes
132
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
0 additions
and
1666 deletions
+0
-1666
runtime/tests/python/integration/operators/triton_core_models/multiply/config.pbtxt
...ration/operators/triton_core_models/multiply/config.pbtxt
+0
-16
runtime/tests/python/integration/operators/triton_core_models/postprocessing/1/model.py
...on/operators/triton_core_models/postprocessing/1/model.py
+0
-66
runtime/tests/python/integration/operators/triton_core_models/postprocessing/config.pbtxt
.../operators/triton_core_models/postprocessing/config.pbtxt
+0
-55
runtime/tests/python/integration/operators/triton_core_models/preprocessing/1/model.py
...ion/operators/triton_core_models/preprocessing/1/model.py
+0
-86
runtime/tests/python/integration/operators/triton_core_models/preprocessing/config.pbtxt
...n/operators/triton_core_models/preprocessing/config.pbtxt
+0
-68
runtime/tests/python/integration/test_add_multiply_divide.py
runtime/tests/python/integration/test_add_multiply_divide.py
+0
-234
runtime/tests/python/integration/test_consolidated_logging.py
...ime/tests/python/integration/test_consolidated_logging.py
+0
-237
runtime/tests/python/integration/test_direct.py
runtime/tests/python/integration/test_direct.py
+0
-178
runtime/tests/python/integration/test_mock_disaggregated_serving.py
...sts/python/integration/test_mock_disaggregated_serving.py
+0
-290
runtime/tests/python/integration/test_perf_benchmark.py
runtime/tests/python/integration/test_perf_benchmark.py
+0
-237
runtime/tests/python/unit/test_args.py
runtime/tests/python/unit/test_args.py
+0
-127
runtime/tests/python/unit/test_logger.py
runtime/tests/python/unit/test_logger.py
+0
-72
No files found.
runtime/tests/python/integration/operators/triton_core_models/multiply/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
runtime/tests/python/integration/operators/triton_core_models/postprocessing/1/model.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """Triton Python-backend post-processing model.

    Decodes token ids produced upstream back into token strings using a
    hard-coded XLNet tokenizer (stands in for a real LLM tokenizer).
    """

    def initialize(self, args):
        # Parse the model config so we can look up output datatypes below.
        model_config = json.loads(args["model_config"])
        # Cache the numpy dtype of each declared output as self.<name>_dtype
        # (here only "OUTPUT" -> self.output_dtype).
        for output_name in ["OUTPUT"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        # pb_utils.Logger is used as a module-level logger object.
        self._logger = pb_utils.Logger

    def execute(self, requests):
        """Convert each request's OUTPUT_IDS tensor to token strings."""
        responses = []
        for idx, request in enumerate(requests):
            # Get input tensors
            output_ids = pb_utils.get_input_tensor_by_name(
                request, "OUTPUT_IDS"
            ).as_numpy()
            # Map ids back to token strings.
            output_result = np.array(
                self.tokenizer.convert_ids_to_tokens((output_ids.tolist()))
            )
            # NOTE(review): exact whitespace of this log f-string reconstructed
            # from a mangled scrape — confirm against the original file.
            self._logger.log_verbose(f"Output Result \n\n{output_result}")
            output_tensor = pb_utils.Tensor(
                "OUTPUT", output_result.astype(self.output_dtype)
            )
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor]
            )
            responses.append(inference_response)
        return responses
runtime/tests/python/integration/operators/triton_core_models/postprocessing/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for postprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for postprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
runtime/tests/python/integration/operators/triton_core_models/preprocessing/1/model.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """
    This is a mock disaggregated serving pre-processing model.
    """

    def initialize(self, args):
        # Parse the model config so we can look up output datatypes below.
        model_config = json.loads(args["model_config"])
        # Cache the numpy dtype of each declared output as self.<name>_dtype
        # (self.input_ids_dtype, self.input_length_dtype, ...).
        for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        # pb_utils.Logger is used as a module-level logger object.
        self.logger = pb_utils.Logger

    def execute(self, requests):
        """Tokenize each request's "query" string into INPUT_IDS plus lengths."""
        self.logger.log_verbose("In preprocessing execute!")
        responses = []
        for idx, request in enumerate(requests):
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "request_output_len"
            ).as_numpy()
            self.logger.log_verbose(f"query(pre-proc) {query}")
            # query is a 1-element array of bytes; decode then encode to ids.
            tokenize = np.array(self.tokenizer.encode(query[0].decode()))
            self.logger.log_verbose(f"tokenize(pre-proc) {tokenize.size}")
            input_length = np.array([tokenize.size])
            # Just forwarding query to the pre-processed input_ids
            input_id_tensor = pb_utils.Tensor(
                "INPUT_IDS", tokenize.astype(self.input_ids_dtype)
            )
            # Just forwarding query to the pre-processed input_ids
            input_length_tensor = pb_utils.Tensor(
                "INPUT_LENGTH", input_length.astype(self.input_length_dtype)
            )
            request_output_len_tensor = pb_utils.Tensor(
                "REQUEST_OUTPUT_LEN", request_output_len
            )
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    input_id_tensor,
                    input_length_tensor,
                    request_output_len_tensor,
                ]
            )
            responses.append(inference_response)
        return responses
runtime/tests/python/integration/operators/triton_core_models/preprocessing/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
runtime/tests/python/integration/test_add_multiply_divide.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
sys
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator (add/multiply/divide + the
    add_multiply_divide aggregate) and tear the deployment down after the test.

    log_dir is a fixture — assumed to be a pathlib.Path (it is used with `/`
    and `.mkdir`); confirm against conftest.
    """
    operator_configs = {}
    # Parametrized flag forwarded into each operator config.
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
    # Add configs for triton core operators
    triton_core_operators = ["add", "multiply", "divide"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            parameters={"store_outputs_in_response": store_outputs_in_response},
            repository=MODEL_REPOSITORY,
        )
    # Add configs for other custom operators
    operator_name = "add_multiply_divide"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        parameters={"store_outputs_in_response": store_outputs_in_response},
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Set the logging directory
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests, store_inputs_in_request):
    """
    Post requests to add_multiply_divide operator.

    Connects its own UCP data plane and NATS request plane, sends
    `num_requests` random inputs, and asserts every response matches the
    expected values from _create_inputs. Planes are closed before returning.
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    add_multiply_divide_operator = RemoteOperator(
        "add_multiply_divide", request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_operator.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        # Optionally embed the input tensor in the request message itself
        # instead of referencing it through the data plane.
        if store_inputs_in_request:
            request.store_inputs_in_request.add("int64_input")
        print(request)
        results.append(add_multiply_divide_operator.async_infer(request))
        expected_results[request_id] = outputs[i]
    # Await responses in completion order and check every output tensor.
    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release tensor references promptly so the data plane can
                # reclaim buffers before close().
                del output
            print(expected_results[response.request_id])
            del response
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(
        post_requests(
            num_requests=num_requests,
            store_inputs_in_request=store_inputs_in_request,
        )
    )
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(False, False), (True, True)],
)
def test_add_multiply_divide(
    request,
    nats_server,
    workers,
    store_inputs_in_request,
    store_outputs_in_response,
):
    """End-to-end check of the add_multiply_divide pipeline.

    The client runs in a child process so each test gets its own data
    plane; exit code 0 means all responses matched expectations.
    """
    client_proc = Process(target=run, args=(2, store_inputs_in_request))
    client_proc.start()
    client_proc.join()
    assert client_proc.exitcode == 0
runtime/tests/python/integration/test_consolidated_logging.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
pathlib
import
sys
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator, optionally consolidating worker logs.

    Same topology as the add_multiply_divide test, but the Deployment is
    given the parametrized `consolidate_logs` flag plus the shared log_dir
    so the test can count the log files each mode produces.
    """
    operator_configs = {}
    # Add configs for triton core operators
    triton_core_operators = ["add", "multiply", "divide"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            repository=MODEL_REPOSITORY,
        )
    # Add configs for other custom operators
    operator_name = "add_multiply_divide"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Set the logging directory
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    # Parametrized flag controlling whether worker logs are merged.
    consolidate_logs = request.getfixturevalue("consolidate_logs")
    worker_deployment = Deployment(
        worker_configs,
        consolidate_logs=consolidate_logs,
        log_dir=log_dir,
    )
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests):
    """
    Post requests to add_multiply_divide operator.

    Connects its own UCP data plane and NATS request plane, sends
    `num_requests` random inputs, and asserts every response matches the
    expected values from _create_inputs. Planes are closed before returning.
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    add_multiply_divide_operator = RemoteOperator(
        "add_multiply_divide", request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_operator.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        print(request)
        results.append(add_multiply_divide_operator.async_infer(request))
        expected_results[request_id] = outputs[i]
    # Await responses in completion order and check every output tensor.
    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release tensor references promptly so the data plane can
                # reclaim buffers before close().
                del output
            print(expected_results[response.request_id])
            del response
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(post_requests(num_requests=num_requests))
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
    "consolidate_logs",
    [True, False],
)
def test_consolidate_logs(request, nats_server, workers, consolidate_logs, log_dir):
    """Run the pipeline, then verify the per-worker log file layout for both
    consolidated and non-consolidated logging modes."""
    # Using a separate process to use data plane across multiple tests.
    p = Process(target=run, args=(2,))
    p.start()
    p.join()
    assert p.exitcode == 0
    # Test the number of logs that were created
    log_dir_path = pathlib.Path(log_dir) / request.node.name
    worker_log_dir_count = 0
    for name in log_dir_path.iterdir():
        worker_log_dir_count += 1
        expected_worker_log_count = 1
        # Non-consolidated triton-core workers write an extra log file;
        # the custom add_multiply_divide worker always writes one.
        if not consolidate_logs and name.stem not in ["add_multiply_divide"]:
            expected_worker_log_count = 2
        worker_log_path = log_dir_path / name.stem
        worker_log_count = 0
        for log_name in worker_log_path.iterdir():
            worker_log_count += 1
        assert worker_log_count == expected_worker_log_count
    # One log directory per worker: add, multiply, divide, add_multiply_divide.
    assert worker_log_dir_count == 4
runtime/tests/python/integration/test_direct.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
sys
import
uuid
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir, number_workers=10):
    """Deploy `number_workers` identical workers all serving the "identity"
    operator, so the test can exercise direct (targeted) request routing."""
    # Add configs for identity operator
    operator_name = "identity"
    operator_config = OperatorConfig(
        name=operator_name,
        implementation="identity:Identity",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    for i in range(number_workers):
        # Set the logging directory
        worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
        worker_configs.append(
            WorkerConfig(
                name=operator_name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
async def post_requests(num_requests, num_targets):
    """
    Posts requests until the number of
    workers that respond is equal to the number of targets
    after that - only sends requests to one of the targets
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    identity_operator = RemoteOperator("identity", request_plane, data_plane)
    target_components = set()
    target_component_list: list[uuid.UUID] = []
    responding_components = set()
    for index in range(num_requests):
        request = identity_operator.create_request(
            inputs={"input": [index]},
        )
        target_component = None
        if target_component_list:
            # we have the list of targets
            # only send to workers in that list
            target_index = index % len(target_component_list)
            target_component = target_component_list[target_index]
            identity_operator.component_id = target_component
        async for response in await identity_operator.async_infer(request):
            responding_component = response.component_id
            # Identity operator must echo the input back unchanged.
            numpy.testing.assert_equal(
                numpy.from_dlpack(response.outputs["output"]),
                request.inputs["input"],
            )
            responding_components.add(responding_component)
            if not target_component_list:
                # add to list of acceptable targets
                target_components.add(responding_component)
                # NOTE(review): nesting of this finalization under the
                # "not target_component_list" branch reconstructed from a
                # mangled scrape — confirm against the original file.
                if len(target_components) >= num_targets:
                    # finalize list
                    target_component_list = list(target_components)
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
    # Once targeting starts, only workers in the target list should respond.
    assert target_components == responding_components
def run(num_requests, num_targets=5):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(
        post_requests(
            num_requests=num_requests,
            num_targets=num_targets,
        )
    )
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(30)
def test_direct(request, nats_server, workers):
    """Direct-routing smoke test: the client child process exits 0 when
    targeted requests are answered only by the selected workers."""
    client_proc = Process(target=run, args=(50,))
    client_proc.start()
    client_proc.join()
    assert client_proc.exitcode == 0
runtime/tests/python/integration/test_mock_disaggregated_serving.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
queue
import
sys
import
time
from
functools
import
partial
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
tritonclient.grpc
as
grpcclient
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
transformers
import
XLNetTokenizer
from
tritonclient.utils
import
InferenceServerException
from
tritonserver
import
Tensor
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# Slower test than others - make it nightly for now
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator for this test and tear it down afterwards.

    Yields the started Deployment; shutdown is guaranteed via try/finally even
    if the test body (or a later fixture) raises.
    """
    operator_configs = {}

    # Add configs for triton core operators
    triton_core_operators = ["preprocessing", "context", "generation", "postprocessing"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            repository=MODEL_REPOSITORY,
        )

    # Add configs for other custom operators
    operator_name = "mock_disaggregated_serving"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="mock_disaggregated_serving:MockDisaggregatedServing",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )

    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)

    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Each worker logs into its own sub-directory of the test's log dir.
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )

    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    try:
        yield worker_deployment
    finally:
        # Previously shutdown() was only reached on a clean resume; a failure
        # while the fixture was live would leak the worker processes.
        worker_deployment.shutdown()
def _create_inputs(number):
    """Build `number` identical request/expected-output pairs.

    Each input is {"query": 1-element object array, "request_output_len": int32
    array}; each expected output wraps the prompt's token strings in a Tensor.

    Returns:
        (inputs, outputs): parallel lists of length `number`.
    """
    inputs = []
    outputs = []
    # Loading the tokenizer is expensive (reads/downloads the vocab); it is
    # loop-invariant, so do it once instead of once per request.
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    for _ in range(number):
        request_output_len = 10
        query_arr = numpy.array(["This is a sample prompt"], dtype=numpy.object_)
        request_output_len_arr = numpy.array([request_output_len], dtype=numpy.int32)
        input_ = {"query": query_arr, "request_output_len": request_output_len_arr}
        # NOTE: the previous numpy.repeat(query_arr, ...) "expected_output" was
        # dead code — it was immediately overwritten by the token array below.
        tokens = numpy.array(tokenizer.encode(query_arr[0]))
        expected_output = numpy.array(tokenizer.convert_ids_to_tokens(tokens.tolist()))
        # presumably mirrors what the postprocessing model emits — confirm
        # against the models in MODEL_REPOSITORY.
        output_data_ = {"output": Tensor._from_object(expected_output)}
        inputs.append(input_)
        outputs.append(output_data_)
    return inputs, outputs
async def post_requests(num_requests):
    """Send `num_requests` requests to the mock disaggregated-serving operator
    and validate every streamed response against the expected token output.

    Opens its own UCX data plane and NATS request plane; both are closed in a
    finally block whether or not the requests succeed.

    Raises:
        Any exception hit while collecting responses (after printing it).
    """
    ucp.reset()
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()

    mock_disaggregated_serving_operator = RemoteOperator(
        "mock_disaggregated_serving", request_plane, data_plane
    )
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests)
    token_latency = []
    timeout = True  # forwarded as UcpDataPlane.close(wait_for_release=...)
    try:
        for i, input_dict in enumerate(inputs):
            request_id = str(i)
            request = mock_disaggregated_serving_operator.create_request(
                inputs=input_dict, request_id=request_id
            )
            begin = time.time()
            response_count = 0
            async for response in await mock_disaggregated_serving_operator.async_infer(
                inference_request=request
            ):
                token_latency.append(time.time() - begin)
                expected_results[request_id] = outputs[i]
                # Intermediate (non-final) responses carry the output tensor.
                if not response.final:
                    for output_name, expected_value in expected_results[
                        response.request_id
                    ].items():
                        output_value = response.outputs[output_name].to_bytes_array()
                        print(f"Final Output: {output_value}")
                        numpy.testing.assert_equal(
                            output_value, expected_value.to_bytes_array()
                        )
                response_count += 1
            # 1 response from context and 10 responses from generation
            assert response_count == 11
    except Exception as e:
        # The old handler did `del response` here, which raised NameError and
        # masked the real error whenever async_infer failed before the first
        # response was bound; it also duplicated the plane cleanup and used
        # `raise e`, losing the original traceback position.
        print("Failed collecting responses:" + repr(e))
        raise
    finally:
        print(f"Token latency: {token_latency}")
        data_plane.close(wait_for_release=timeout)
        await request_plane.close()
def run(num_requests):
    """Drive post_requests in a fresh event loop; exit with its result.

    post_requests returns None on success, so the process exit code is 0.
    """
    result = asyncio.run(post_requests(num_requests=num_requests))
    sys.exit(result)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present or test is not configured to run with mock disaggregated serving",
)
def test_mock_disaggregated_serving(request, nats_server, workers):
    """Post one request end-to-end from a child process; expect a clean exit."""
    # A child process lets each test own its data-plane lifecycle instead of
    # sharing one with the pytest main process.
    client_process = Process(target=run, args=(1,))
    client_process.start()
    client_process.join()
    assert client_process.exitcode == 0
class UserData:
    """Shared state between the gRPC stream callback and the test body."""

    def __init__(self):
        # Filled by `callback` with successful results, or with
        # InferenceServerException objects on failure; drained in arrival
        # order by send_kserve_requests.
        self._completed_requests: queue.Queue[
            grpcclient.Result | InferenceServerException
        ] = queue.Queue()
# Stream callback for InferenceServerClient. The final two parameters must be
# (result, error): on success `error` is None and `result` carries the
# inference result; on failure `error` is a tritonclient
# InferenceServerException with the details. Whichever is populated gets
# queued for the consumer.
def callback(user_data, result, error):
    user_data._completed_requests.put(error if error else result)
async def send_kserve_requests(num_requests):
    """Send streaming requests through the KServe gRPC frontend at
    localhost:8001 and print each decoded "output" tensor.

    Raises the first InferenceServerException received on the stream.
    """
    # NOTE(review): outputs_dicts (the expected outputs) is never compared
    # against the received results — responses are only printed below.
    inputs_dict, outputs_dicts = _create_inputs(num_requests)
    inputs = []
    inputs.append(grpcclient.InferInput("query", [1], "BYTES"))
    inputs.append(grpcclient.InferInput("request_output_len", [1], "INT32"))
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        # Responses arrive asynchronously via `callback` into user_data.
        client.start_stream(
            callback=partial(callback, user_data),
        )
        for i, input_dict in enumerate(inputs_dict):
            inputs[0].set_data_from_numpy(input_dict["query"])
            inputs[1].set_data_from_numpy(input_dict["request_output_len"])
            client.async_stream_infer(
                model_name="mock_disaggregated_serving", inputs=inputs
            )
        recv_count = 0
        # NOTE(review): waits for exactly 10 responses regardless of
        # num_requests — presumably the generation responses of a single
        # request; confirm before calling with num_requests > 1.
        while recv_count < 10:
            data_item = user_data._completed_requests.get()
            recv_count += 1
            if isinstance(data_item, InferenceServerException):
                raise data_item
            else:
                result = data_item.as_numpy("output")
                print("test\n")
                print(result)
    # Wait for the tensor clean-up
    time.sleep(5)
def run_kserve(num_requests):
    """Run send_kserve_requests in a new event loop; exit with its result.

    send_kserve_requests returns None on success, giving exit code 0.
    """
    status = asyncio.run(send_kserve_requests(num_requests=num_requests))
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
def test_mock_disaggregated_serving_kserve(request, nats_server, workers, api_server):
    """Same flow as test_mock_disaggregated_serving, via the KServe frontend."""
    # A child process lets each test own its data-plane lifecycle instead of
    # sharing one with the pytest main process.
    client_process = Process(target=run_kserve, args=(1,))
    client_process.start()
    client_process.join()
    assert client_process.exitcode == 0
runtime/tests/python/integration/test_perf_benchmark.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
numpy
import
pytest
import
ucp
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port the test-local NATS server (nats_server fixture) listens on.
NATS_PORT = 4223
# Triton-core model repository (unused directly here; kept for parity with the
# integration tests).
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
# Repository containing the identity operator used by this benchmark.
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Minimal worker logging so log I/O does not perturb the benchmark numbers.
TRITON_LOG_LEVEL = 0
logger = get_logger(__name__)

# Slower test than others - make it nightly for now.
# pytestmark applies the marker to every test in this module.
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(log_dir, request, number_workers=1):
    """Deploy `number_workers` identity-operator workers for one benchmark run.

    NOTE(review): number_workers is a plain default parameter on a fixture —
    pytest ignores defaulted arguments when resolving fixtures, so it is
    always 1 here; confirm that is intended.
    """
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")

    # Add configs for identity operator
    operator_name = "identity"
    operator_config = OperatorConfig(
        name=operator_name,
        implementation="identity:Identity",
        version=1,
        max_inflight_requests=10,
        parameters={"store_outputs_in_response": store_outputs_in_response},
        repository=OPERATORS_REPOSITORY,
    )

    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    for i in range(number_workers):
        # Each worker logs into its own sub-directory of the test's log dir.
        worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
        worker_configs.append(
            WorkerConfig(
                name=operator_name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )

    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    try:
        yield worker_deployment
    finally:
        # Guarantee worker teardown even if the benchmark body raises;
        # previously a failure while the fixture was live leaked the workers.
        worker_deployment.shutdown()
def
_create_inputs
(
number
,
tensor_size_in_kb
):
inputs
=
[]
outputs
=
[]
elem_cnt
=
int
(
tensor_size_in_kb
*
1024
/
4
)
for
_
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
elem_cnt
])
expected_
=
{}
expected_
[
"output"
]
=
input_
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
def run(
    aio_benchmark,
    store_inputs_in_request,
    store_outputs_in_response,
    tensor_size_in_kb,
    data_plane_tracker,
):
    """Benchmark post_requests against the identity operator.

    The shared UcpDataPlane is opened on the first parametrized run and closed
    on the last (tracked by data_plane_tracker); the NATS request plane is
    opened and closed on every run.
    """
    if data_plane_tracker.is_first_run:
        ucp.reset()
        data_plane_tracker._data_plane = UcpDataPlane()
        data_plane_tracker._data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated since Python 3.10; consider an explicit loop if revived.
    asyncio.get_event_loop().run_until_complete(request_plane.connect())
    identity_operator = RemoteOperator(
        "identity", request_plane, data_plane_tracker._data_plane
    )
    inputs, outputs = _create_inputs(1, tensor_size_in_kb)
    # aio_benchmark presumably awaits post_requests repeatedly to gather
    # latency statistics — see the aio_benchmark fixture for the contract.
    aio_benchmark(
        post_requests,
        identity_operator,
        inputs,
        outputs,
        store_inputs_in_request,
        store_outputs_in_response,
    )
    timeout = 5
    asyncio.get_event_loop().run_until_complete(request_plane.close())
    if data_plane_tracker.is_last_run:
        # NOTE(review): positional arg — the sibling integration test calls
        # close(wait_for_release=...); confirm 5 matches the first positional
        # parameter's meaning here.
        data_plane_tracker._data_plane.close(timeout)
async def post_requests(
    identity_model, inputs, outputs, store_inputs_in_request, store_outputs_in_response
):
    """Issue one request per input tensor and pull every echoed output to host.

    `store_outputs_in_response` is accepted for signature parity with the
    benchmark parametrization; only the worker-side operator reads it.
    """
    pending = []
    expected_results = {}
    for index, tensor in enumerate(inputs):
        request_id = str(index)
        request = identity_model.create_request(
            inputs={"input": tensor}, request_id=request_id
        )
        if store_inputs_in_request:
            # presumably marks "input" for embedding in the request itself
            # rather than the data plane — name-based assumption, confirm.
            request.store_inputs_in_request.add("input")
        pending.append(identity_model.async_infer(request))
        expected_results[request_id] = outputs[index]
    for finished in asyncio.as_completed(pending):
        responses = await finished
        async for response in responses:
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                # Materialize on host so the remote tensor can be released.
                _ = numpy.from_dlpack(output.to_host())
                del output
            del response
@pytest.fixture(scope="module")
def data_plane_tracker():
    """Module-scoped counter so a single UcpDataPlane spans all benchmark runs.

    test_identity sets total_runs once and calls increment_run per run; `run`
    opens the data plane on is_first_run and closes it on is_last_run.
    """

    class Tracker:
        def __init__(self):
            # Total parametrized runs expected (set by the first test run).
            self.total_runs = 0
            # Runs executed so far (bumped via increment_run()).
            self.current_run = 0
            # The shared UcpDataPlane instance, assigned by `run`.
            self._data_plane = None

        def increment_run(self):
            self.current_run += 1

        @property
        def is_first_run(self):
            return self.current_run == 1

        @property
        def is_last_run(self):
            return self.current_run == self.total_runs

    return Tracker()
# FIXME: NATS default size limit is 1 MB. However, even when the tensor_size_in_kb
# is set as 600, which corresponds to 0.6144 MB, we are hitting MaxPayloadError.
# Need to investigate why the limit is being hit.
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present or test is configured to run with mock disaggregated_serving",
)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(True, True), (False, False)],
)
@pytest.mark.parametrize(
    "tensor_size_in_kb",
    [10, 100, 500],
)
@pytest.mark.benchmark(min_rounds=100, max_time=1)
def test_identity(
    request,
    nats_server,
    workers,
    aio_benchmark,
    store_inputs_in_request,
    store_outputs_in_response,
    tensor_size_in_kb,
    data_plane_tracker,
):
    """
    This benchmark test checks the latency of a simple operator which returns input in its output
    without any processing.
    NOTE: We can not use benchmark fixture in the child process. Hence, we are required to use the
    same process for opening then data plane object as pytest.
    This means that the pytest main process cannot create another data plane object in any other
    tests. Hence, we will use a run tracker to open and close the data plane
    """
    if data_plane_tracker.total_runs == 0:
        # 6 == 2 (store-flag pairs) x 3 (tensor sizes) parametrize combos;
        # must stay in sync with the parametrize decorators above.
        data_plane_tracker.total_runs = 6  # Set this to the number of parameters
    data_plane_tracker.increment_run()
    run(
        aio_benchmark,
        store_inputs_in_request,
        store_outputs_in_response,
        tensor_size_in_kb,
        data_plane_tracker,
    )
runtime/tests/python/unit/test_args.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
from
triton_distributed.runtime.parser
import
Parser
"""
Tests for parsing the arguments by command line parser
"""
@pytest.fixture
def default_values():
    """Expected CLI defaults that the parser tests compare against."""
    # TODO: Add the default options for the worker executable here
    return {
        "request_plane_uri": "nats://localhost:4222",
        "log_level": 0,
    }
def test_parse_args_default(default_values):
    """Parser defaults must match the documented CLI defaults."""
    args, _ = Parser.parse_args([])
    assert args.request_plane_uri == default_values["request_plane_uri"]
    assert args.log_level == default_values["log_level"]
    # Plain asserts replace the previous `raise Exception(...)` blocks: pytest
    # reports assertion failures with introspection, and a failed expectation
    # should be a test failure, not a generic Exception.
    assert not args.operators, f"Expected no operators by default, got {args.operators}"
    assert (
        not args.operator_configs
    ), f"Expected no operators by default, got {args.operator_configs}"
@pytest.mark.parametrize(
    "valid_request_plane_uri",
    [
        "https://example.com",
        # Add valid request plane uri values
    ],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
    """A user-supplied --request-plane-uri must round-trip through the parser."""
    parsed, _ = Parser.parse_args(["--request-plane-uri", valid_request_plane_uri])
    assert parsed.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
    """Return args_list with every None entry removed, order preserved.

    Falsy non-None values (0, "", False) are kept.
    """
    cleaned = []
    for item in args_list:
        if item is not None:
            cleaned.append(item)
    return cleaned
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        ("name:abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
    """--operator with repository and module specs parses into args.operators."""
    # Create a model repository containing the "abc" model the args refer to.
    model_repo_path = tmp_path / "model_repo"
    model_repo_path.mkdir()
    (model_repo_path / "abc").mkdir()

    model_args = clean_argument_list(
        [
            first_arg,
            second_arg,
            third_arg,
            f"repository:{model_repo_path}",
            "module:worker.triton_core_operator:TritonCoreOperator",
        ]
    )
    print(model_args)
    cli_args = ["--operator"] + model_args
    parsed, _ = Parser.parse_args(cli_args)
    assert parsed.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
    """--operator with no values must exit with a usage error on stderr."""
    with pytest.raises(SystemExit):
        Parser.parse_args(["--operator"])
    stderr = capsys.readouterr().err
    assert "expected at least one argument" in stderr
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        # TODO: Revisit can be uncommented once the operator module can be inferred automatically.
        # ("abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
    """--operator plus a module spec parses back as the original value list."""
    # Create a worker repository containing the "abc" operator referenced above.
    repo_path = tmp_path / "worker_repo"
    repo_path.mkdir()
    (repo_path / "abc").mkdir()

    operator_args = clean_argument_list([first_arg, second_arg, third_arg])
    cli_args = ["--operator"] + operator_args + ["module:dummyworkflow:Workflow"]
    parsed, _ = Parser.parse_args(cli_args)
    assert parsed.operators[0] == operator_args + ["module:dummyworkflow:Workflow"]
runtime/tests/python/unit/test_logger.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
pytest
from
triton_distributed.runtime.logger
import
get_logger
logger = logging.getLogger(__name__)

MSG = "This is a sample message"

"""
Tests for Logging module
"""


def logging_function(logger):
    """Emit one record at each severity exercised by the logging tests.

    Order of records: info, warning, exception (logged at ERROR), error, debug.
    """
    for emit in (logger.info, logger.warning):
        emit(MSG)
    try:
        raise Exception("This is an exception")
    except Exception:
        # logger.exception must run inside an except clause so the traceback
        # is attached to the record.
        logger.exception(MSG)
    for emit in (logger.error, logger.debug):
        emit(MSG)
@pytest.fixture
def reset_logger(caplog):
    """Strip handlers and reset level/propagation on every known logger,
    then clear caplog, so each test starts from pristine logging state."""
    all_loggers = [
        logging.getLogger(name) for name in logging.root.manager.loggerDict
    ]
    all_loggers.append(logging.getLogger())  # include the root logger
    for lg in all_loggers:
        # Iterate over a snapshot: removeHandler mutates lg.handlers.
        for handler in list(lg.handlers):
            lg.removeHandler(handler)
            handler.close()
        lg.setLevel(logging.NOTSET)
        lg.propagate = True
    caplog.clear()
@pytest.mark.parametrize(
    "log_level, expected_record_counts",
    [
        # For log-level 0 only error and exception should be recorded
        (0, 2),
        # For log-level 1 only info, error, exception and warning should be recorded
        (1, 4),
        # All logs(error, exception, info, debug and warning) should be printed for log-level 2
        (2, 5),
    ],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
    """Record counts from logging_function must match the configured level."""
    caplog.set_level(log_level)
    module_logger = get_logger(logger_name="test_logging", log_level=log_level)
    logging_function(module_logger)
    assert len(caplog.records) == expected_record_counts
Prev
1
…
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment