Commit 0bfd9a76 authored by Neelay Shah, committed by GitHub

refactor: remove python native runtime

parent 8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from transformers import XLNetTokenizer
class TritonPythonModel:
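    """
    Mock disaggregated-serving post-processing model: converts output token
    ids back into token strings.
    """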
def initialize(self, args):
model_config = json.loads(args["model_config"])
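        # Create one numpy-dtype attribute per configured output
        # (e.g. self.output_dtype) from the model config.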
for output_name in ["OUTPUT"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
        # Use a hard-coded XLNet tokenizer as a mock stand-in for the real model's tokenizer
self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
self._logger = pb_utils.Logger
def execute(self, requests):
responses = []
        for request in requests:
# Get input tensors
output_ids = pb_utils.get_input_tensor_by_name(
request, "OUTPUT_IDS"
).as_numpy()
            output_result = np.array(
                self.tokenizer.convert_ids_to_tokens(output_ids.tolist())
            )
self._logger.log_verbose(f"Output Result \n\n {output_result}")
output_tensor = pb_utils.Tensor(
"OUTPUT", output_result.astype(self.output_dtype)
)
inference_response = pb_utils.InferenceResponse(
output_tensors=[output_tensor]
)
responses.append(inference_response)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the post-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
  # Add more inputs as required.
  # For simplicity we stick to only
  # these inputs for postprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
  # Add more outputs as required.
  # For simplicity we stick to only
  # these outputs for postprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from transformers import XLNetTokenizer
class TritonPythonModel:
"""
This is a mock disaggregated serving pre-processing model.
"""
def initialize(self, args):
model_config = json.loads(args["model_config"])
for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
        # Use a hard-coded XLNet tokenizer as a mock stand-in for the real model's tokenizer
self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
self.logger = pb_utils.Logger
def execute(self, requests):
self.logger.log_verbose("In preprocessing execute!")
responses = []
        for request in requests:
# Get input tensors
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
request_output_len = pb_utils.get_input_tensor_by_name(
request, "request_output_len"
).as_numpy()
self.logger.log_verbose(f"query(pre-proc) {query}")
tokenize = np.array(self.tokenizer.encode(query[0].decode()))
self.logger.log_verbose(f"tokenize(pre-proc) {tokenize.size}")
input_length = np.array([tokenize.size])
            # The tokenized query becomes the pre-processed INPUT_IDS
input_id_tensor = pb_utils.Tensor(
"INPUT_IDS", tokenize.astype(self.input_ids_dtype)
)
            # Report the token count as INPUT_LENGTH
input_length_tensor = pb_utils.Tensor(
"INPUT_LENGTH", input_length.astype(self.input_length_dtype)
)
request_output_len_tensor = pb_utils.Tensor(
"REQUEST_OUTPUT_LEN", request_output_len
)
inference_response = pb_utils.InferenceResponse(
output_tensors=[
input_id_tensor,
input_length_tensor,
request_output_len_tensor,
]
)
responses.append(inference_response)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
  # Add more inputs as required.
  # For simplicity we stick to only
  # these inputs for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
  # Add more outputs as required.
  # For simplicity we stick to only
  # these outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import sys
from multiprocessing import Process
import cupy
import numpy
import pytest
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.runtime.deployment import Deployment
from triton_distributed.runtime.logger import get_logger
from triton_distributed.runtime.operator import OperatorConfig
from triton_distributed.runtime.remote_operator import RemoteOperator
from triton_distributed.runtime.triton_core_operator import TritonCoreOperator
from triton_distributed.runtime.worker import WorkerConfig
NATS_PORT = 4223
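# Note: 4223 is a non-default port (the NATS default is 4222), presumably to
# avoid clashing with any locally running NATS instance.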
MODEL_REPOSITORY = (
"/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6
logger = get_logger(__name__)
# Call cupy's cuda.is_available() once up front so the CUDARuntimeError
# does not surface later in runtime code.
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available.")
# TODO: decide whether this test should run pre-merge, nightly, or weekly.
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
operator_configs = {}
store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
# Add configs for triton core operators
triton_core_operators = ["add", "multiply", "divide"]
for operator_name in triton_core_operators:
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=MODEL_REPOSITORY,
)
# Add configs for other custom operators
operator_name = "add_multiply_divide"
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation="add_multiply_divide:AddMultiplyDivide",
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=OPERATORS_REPOSITORY,
)
worker_configs = []
test_log_dir = log_dir / request.node.name
test_log_dir.mkdir(parents=True, exist_ok=True)
# We will instantiate a worker for each operator
for name, operator_config in operator_configs.items():
# Set the logging directory
worker_log_dir = test_log_dir / name
worker_configs.append(
WorkerConfig(
name=name,
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=(
[],
{"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
),
log_level=TRITON_LOG_LEVEL,
log_dir=str(worker_log_dir),
operators=[operator_config],
)
)
worker_deployment = Deployment(worker_configs)
worker_deployment.start()
yield worker_deployment
worker_deployment.shutdown()
def _create_inputs(number, size):
inputs = []
outputs = []
for index in range(number):
input_ = numpy.random.randint(low=1, high=100, size=[2, size])
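        # A 2 x `size` tensor: the "partial" expected outputs below reduce each
        # row, while the "total" outputs reduce the whole tensor or combine the
        # partial results.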
expected_ = {}
expected_["add_int64_output_total"] = numpy.array([[input_.sum()]])
expected_["add_int64_output_partial"] = numpy.array([[x.sum() for x in input_]])
expected_["multiply_int64_output_total"] = numpy.array(
[[x.prod() for x in expected_["add_int64_output_partial"]]]
)
divisor = expected_["add_int64_output_total"][0][0]
dividends = expected_["add_int64_output_partial"]
expected_["divide_fp64_output_partial"] = numpy.array(
[numpy.divide(dividends, divisor)]
)
inputs.append(input_)
outputs.append(expected_)
return inputs, outputs
async def post_requests(num_requests, store_inputs_in_request):
"""
Post requests to add_multiply_divide operator.
"""
ucp.reset()
data_plane = UcpDataPlane()
data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
await request_plane.connect()
add_multiply_divide_operator = RemoteOperator(
"add_multiply_divide", request_plane, data_plane
)
results = []
expected_results = {}
inputs, outputs = _create_inputs(num_requests, 40)
for i, input_ in enumerate(inputs):
request_id = str(i)
request = add_multiply_divide_operator.create_request(
inputs={"int64_input": input_}, request_id=request_id
)
if store_inputs_in_request:
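            # Store the input tensor inline in the request message itself
            # (rather than, presumably, passing only a data-plane reference).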
request.store_inputs_in_request.add("int64_input")
print(request)
results.append(add_multiply_divide_operator.async_infer(request))
expected_results[request_id] = outputs[i]
for result in asyncio.as_completed(results):
responses = await result
async for response in responses:
print(response)
for output_name, expected_value in expected_results[
response.request_id
].items():
output = response.outputs[output_name]
output_value = numpy.from_dlpack(output.to_host())
numpy.testing.assert_equal(output_value, expected_value)
del output
print(expected_results[response.request_id])
del response
timeout = 5
data_plane.close(timeout)
await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
sys.exit(
asyncio.run(
post_requests(
num_requests=num_requests,
store_inputs_in_request=store_inputs_in_request,
)
)
)
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
["store_inputs_in_request", "store_outputs_in_response"],
[(False, False), (True, True)],
)
def test_add_multiply_divide(
request,
nats_server,
workers,
store_inputs_in_request,
store_outputs_in_response,
):
    # Run in a separate process so the data plane can be used across multiple tests.
p = Process(target=run, args=(2, store_inputs_in_request))
p.start()
p.join()
assert p.exitcode == 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import pathlib
import sys
from multiprocessing import Process
import cupy
import numpy
import pytest
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.runtime.deployment import Deployment
from triton_distributed.runtime.logger import get_logger
from triton_distributed.runtime.operator import OperatorConfig
from triton_distributed.runtime.remote_operator import RemoteOperator
from triton_distributed.runtime.triton_core_operator import TritonCoreOperator
from triton_distributed.runtime.worker import WorkerConfig
NATS_PORT = 4223
MODEL_REPOSITORY = (
"/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6
logger = get_logger(__name__)
# Call cupy's cuda.is_available() once up front so the CUDARuntimeError
# does not surface later in runtime code.
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available.")
# TODO: decide whether this test should run pre-merge, nightly, or weekly.
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
operator_configs = {}
# Add configs for triton core operators
triton_core_operators = ["add", "multiply", "divide"]
for operator_name in triton_core_operators:
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
repository=MODEL_REPOSITORY,
)
# Add configs for other custom operators
operator_name = "add_multiply_divide"
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation="add_multiply_divide:AddMultiplyDivide",
version=1,
max_inflight_requests=10,
repository=OPERATORS_REPOSITORY,
)
worker_configs = []
test_log_dir = log_dir / request.node.name
test_log_dir.mkdir(parents=True, exist_ok=True)
# We will instantiate a worker for each operator
for name, operator_config in operator_configs.items():
# Set the logging directory
worker_log_dir = test_log_dir / name
worker_configs.append(
WorkerConfig(
name=name,
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=(
[],
{"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
),
log_level=TRITON_LOG_LEVEL,
log_dir=str(worker_log_dir),
operators=[operator_config],
)
)
consolidate_logs = request.getfixturevalue("consolidate_logs")
worker_deployment = Deployment(
worker_configs,
consolidate_logs=consolidate_logs,
log_dir=log_dir,
)
worker_deployment.start()
yield worker_deployment
worker_deployment.shutdown()
def _create_inputs(number, size):
inputs = []
outputs = []
for index in range(number):
input_ = numpy.random.randint(low=1, high=100, size=[2, size])
expected_ = {}
expected_["add_int64_output_total"] = numpy.array([[input_.sum()]])
expected_["add_int64_output_partial"] = numpy.array([[x.sum() for x in input_]])
expected_["multiply_int64_output_total"] = numpy.array(
[[x.prod() for x in expected_["add_int64_output_partial"]]]
)
divisor = expected_["add_int64_output_total"][0][0]
dividends = expected_["add_int64_output_partial"]
expected_["divide_fp64_output_partial"] = numpy.array(
[numpy.divide(dividends, divisor)]
)
inputs.append(input_)
outputs.append(expected_)
return inputs, outputs
async def post_requests(num_requests):
"""
Post requests to add_multiply_divide operator.
"""
ucp.reset()
data_plane = UcpDataPlane()
data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
await request_plane.connect()
add_multiply_divide_operator = RemoteOperator(
"add_multiply_divide", request_plane, data_plane
)
results = []
expected_results = {}
inputs, outputs = _create_inputs(num_requests, 40)
for i, input_ in enumerate(inputs):
request_id = str(i)
request = add_multiply_divide_operator.create_request(
inputs={"int64_input": input_}, request_id=request_id
)
print(request)
results.append(add_multiply_divide_operator.async_infer(request))
expected_results[request_id] = outputs[i]
for result in asyncio.as_completed(results):
responses = await result
async for response in responses:
print(response)
for output_name, expected_value in expected_results[
response.request_id
].items():
output = response.outputs[output_name]
output_value = numpy.from_dlpack(output.to_host())
numpy.testing.assert_equal(output_value, expected_value)
del output
print(expected_results[response.request_id])
del response
timeout = 5
data_plane.close(timeout)
await request_plane.close()
def run(num_requests):
sys.exit(asyncio.run(post_requests(num_requests=num_requests)))
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
"consolidate_logs",
[True, False],
)
def test_consolidate_logs(request, nats_server, workers, consolidate_logs, log_dir):
    # Run in a separate process so the data plane can be used across multiple tests.
p = Process(target=run, args=(2,))
p.start()
p.join()
assert p.exitcode == 0
# Test the number of logs that were created
log_dir_path = pathlib.Path(log_dir) / request.node.name
worker_log_dir_count = 0
for name in log_dir_path.iterdir():
worker_log_dir_count += 1
expected_worker_log_count = 1
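        # Without consolidation, each Triton-core-backed worker is expected to
        # produce a second log file; presumably the custom add_multiply_divide
        # operator only writes the single worker log.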
if not consolidate_logs and name.stem not in ["add_multiply_divide"]:
expected_worker_log_count = 2
worker_log_path = log_dir_path / name.stem
worker_log_count = 0
for log_name in worker_log_path.iterdir():
worker_log_count += 1
assert worker_log_count == expected_worker_log_count
assert worker_log_dir_count == 4
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import sys
import uuid
from multiprocessing import Process
import cupy
import numpy
import pytest
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.runtime.deployment import Deployment
from triton_distributed.runtime.logger import get_logger
from triton_distributed.runtime.operator import OperatorConfig
from triton_distributed.runtime.remote_operator import RemoteOperator
from triton_distributed.runtime.worker import WorkerConfig
NATS_PORT = 4223
MODEL_REPOSITORY = (
"/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6
logger = get_logger(__name__)
# Call cupy's cuda.is_available() once up front so the CUDARuntimeError
# does not surface later in runtime code.
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available.")
# TODO: decide whether this test should run pre-merge, nightly, or weekly.
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir, number_workers=10):
# Add configs for identity operator
operator_name = "identity"
operator_config = OperatorConfig(
name=operator_name,
implementation="identity:Identity",
version=1,
max_inflight_requests=10,
repository=OPERATORS_REPOSITORY,
)
worker_configs = []
test_log_dir = log_dir / request.node.name
test_log_dir.mkdir(parents=True, exist_ok=True)
for i in range(number_workers):
# Set the logging directory
worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
worker_configs.append(
WorkerConfig(
name=operator_name,
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=(
[],
{"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
),
log_level=TRITON_LOG_LEVEL,
log_dir=str(worker_log_dir),
operators=[operator_config],
)
)
worker_deployment = Deployment(worker_configs)
worker_deployment.start()
yield worker_deployment
worker_deployment.shutdown()
async def post_requests(num_requests, num_targets):
"""
Posts requests until the number of
workers that respond is equal to the number of targets
after that - only sends requests to one of the targets
"""
ucp.reset()
data_plane = UcpDataPlane()
data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
await request_plane.connect()
identity_operator = RemoteOperator("identity", request_plane, data_plane)
target_components = set()
target_component_list: list[uuid.UUID] = []
responding_components = set()
for index in range(num_requests):
request = identity_operator.create_request(
inputs={"input": [index]},
)
target_component = None
if target_component_list:
# we have the list of targets
# only send to workers in that list
target_index = index % len(target_component_list)
target_component = target_component_list[target_index]
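            # Setting component_id routes this request directly to that
            # specific worker rather than to any available worker.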
identity_operator.component_id = target_component
async for response in await identity_operator.async_infer(request):
responding_component = response.component_id
numpy.testing.assert_equal(
numpy.from_dlpack(response.outputs["output"]), request.inputs["input"]
)
responding_components.add(responding_component)
if not target_component_list:
# add to list of acceptable targets
target_components.add(responding_component)
if len(target_components) >= num_targets:
# finalize list
target_component_list = list(target_components)
timeout = 5
data_plane.close(timeout)
await request_plane.close()
assert target_components == responding_components
def run(num_requests, num_targets=5):
sys.exit(
asyncio.run(
post_requests(
num_requests=num_requests,
num_targets=num_targets,
)
)
)
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present",
)
@pytest.mark.timeout(30)
def test_direct(request, nats_server, workers):
    # Run in a separate process so the data plane can be used across multiple tests.
p = Process(target=run, args=(50,))
p.start()
p.join()
assert p.exitcode == 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import queue
import sys
import time
from functools import partial
from multiprocessing import Process
import cupy
import numpy
import pytest
import tritonclient.grpc as grpcclient
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from transformers import XLNetTokenizer
from tritonclient.utils import InferenceServerException
from tritonserver import Tensor
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.runtime.deployment import Deployment
from triton_distributed.runtime.logger import get_logger
from triton_distributed.runtime.operator import OperatorConfig
from triton_distributed.runtime.remote_operator import RemoteOperator
from triton_distributed.runtime.triton_core_operator import TritonCoreOperator
from triton_distributed.runtime.worker import WorkerConfig
NATS_PORT = 4223
MODEL_REPOSITORY = (
"/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6
logger = get_logger(__name__)
# Call cupy's cuda.is_available() once up front so the CUDARuntimeError
# does not surface later in runtime code.
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available.")
# Slower test than others - make it nightly for now
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(request, log_dir):
operator_configs = {}
# Add configs for triton core operators
triton_core_operators = ["preprocessing", "context", "generation", "postprocessing"]
for operator_name in triton_core_operators:
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
repository=MODEL_REPOSITORY,
)
# Add configs for other custom operators
operator_name = "mock_disaggregated_serving"
operator_configs[operator_name] = OperatorConfig(
name=operator_name,
implementation="mock_disaggregated_serving:MockDisaggregatedServing",
version=1,
max_inflight_requests=10,
repository=OPERATORS_REPOSITORY,
)
worker_configs = []
test_log_dir = log_dir / request.node.name
test_log_dir.mkdir(parents=True, exist_ok=True)
# We will instantiate a worker for each operator
for name, operator_config in operator_configs.items():
# Set the logging directory
worker_log_dir = test_log_dir / name
worker_configs.append(
WorkerConfig(
name=name,
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=(
[],
{"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
),
log_level=TRITON_LOG_LEVEL,
log_dir=str(worker_log_dir),
operators=[operator_config],
)
)
worker_deployment = Deployment(worker_configs)
worker_deployment.start()
yield worker_deployment
worker_deployment.shutdown()
def _create_inputs(number):
inputs = []
outputs = []
    # Load the tokenizer once rather than on every loop iteration.
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    for _ in range(number):
        request_output_len = 10
        query_arr = numpy.array(["This is a sample prompt"], dtype=numpy.object_)
        request_output_len_arr = numpy.array([request_output_len], dtype=numpy.int32)
        input_ = {"query": query_arr, "request_output_len": request_output_len_arr}
        tokens = numpy.array(tokenizer.encode(query_arr[0]))
        expected_output = numpy.array(tokenizer.convert_ids_to_tokens(tokens.tolist()))
        output_data_ = {"output": Tensor._from_object(expected_output)}
        inputs.append(input_)
        outputs.append(output_data_)
return inputs, outputs
async def post_requests(num_requests):
ucp.reset()
data_plane = UcpDataPlane()
data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
await request_plane.connect()
mock_disaggregated_serving_operator = RemoteOperator(
"mock_disaggregated_serving", request_plane, data_plane
)
expected_results = {}
inputs, outputs = _create_inputs(num_requests)
begin = None
token_latency = []
    wait_for_release = True
for i, input_dict in enumerate(inputs):
request_id = str(i)
request = mock_disaggregated_serving_operator.create_request(
inputs=input_dict, request_id=request_id
)
begin = time.time()
response_count = 0
try:
async for response in await mock_disaggregated_serving_operator.async_infer(
inference_request=request
):
token_latency.append(time.time() - begin)
expected_results[request_id] = outputs[i]
if not response.final:
for output_name, expected_value in expected_results[
response.request_id
].items():
output = response.outputs[output_name]
output_value = output.to_bytes_array()
print(f"Final Output: {output_value}")
numpy.testing.assert_equal(
output_value, expected_value.to_bytes_array()
)
response_count += 1
# 1 response from context and 10 responses from generation
assert response_count == 11
except Exception as e:
print("Failed collecting responses:" + repr(e))
del response
print(f"Token latency: {token_latency}")
            data_plane.close(wait_for_release=wait_for_release)
await request_plane.close()
raise e
print(f"Token latency: {token_latency}")
    data_plane.close(wait_for_release=wait_for_release)
await request_plane.close()
def run(num_requests):
sys.exit(asyncio.run(post_requests(num_requests=num_requests)))
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present or test is not configured to run with mock disaggregated serving",
)
def test_mock_disaggregated_serving(request, nats_server, workers):
    # Run in a separate process so the data plane can be used across multiple tests.
p = Process(target=run, args=(1,))
p.start()
p.join()
assert p.exitcode == 0
class UserData:
def __init__(self):
self._completed_requests: queue.Queue[
grpcclient.Result | InferenceServerException
] = queue.Queue()
# Define the callback function. Note the last two parameters should be
# result and error. InferenceServerClient provides the result of an
# inference as grpcclient.InferResult in `result`. For a successful
# inference, error will be None; otherwise it will be a
# tritonclientutils.InferenceServerException holding the error details.
def callback(user_data, result, error):
if error:
user_data._completed_requests.put(error)
else:
user_data._completed_requests.put(result)
async def send_kserve_requests(num_requests):
inputs_dict, outputs_dicts = _create_inputs(num_requests)
inputs = []
inputs.append(grpcclient.InferInput("query", [1], "BYTES"))
inputs.append(grpcclient.InferInput("request_output_len", [1], "INT32"))
user_data = UserData()
with grpcclient.InferenceServerClient("localhost:8001") as client:
client.start_stream(
callback=partial(callback, user_data),
)
for i, input_dict in enumerate(inputs_dict):
inputs[0].set_data_from_numpy(input_dict["query"])
inputs[1].set_data_from_numpy(input_dict["request_output_len"])
client.async_stream_infer(
model_name="mock_disaggregated_serving", inputs=inputs
)
recv_count = 0
while recv_count < 10:
data_item = user_data._completed_requests.get()
recv_count += 1
if isinstance(data_item, InferenceServerException):
raise data_item
else:
result = data_item.as_numpy("output")
print("test \n")
print(result)
# Wait for the tensor clean-up
time.sleep(5)
def run_kserve(num_requests):
sys.exit(asyncio.run(send_kserve_requests(num_requests=num_requests)))
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present",
)
def test_mock_disaggregated_serving_kserve(request, nats_server, workers, api_server):
    # Run in a separate process so the data plane can be used across multiple tests.
p = Process(target=run_kserve, args=(1,))
p.start()
p.join()
assert p.exitcode == 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import numpy
import pytest
import ucp
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.runtime.deployment import Deployment
from triton_distributed.runtime.logger import get_logger
from triton_distributed.runtime.operator import OperatorConfig
from triton_distributed.runtime.remote_operator import RemoteOperator
from triton_distributed.runtime.worker import WorkerConfig
NATS_PORT = 4223
MODEL_REPOSITORY = (
"/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
TRITON_LOG_LEVEL = 0
logger = get_logger(__name__)
# Slower test than others - make it nightly for now
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(log_dir, request, number_workers=1):
store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
# Add configs for identity operator
operator_name = "identity"
operator_config = OperatorConfig(
name=operator_name,
implementation="identity:Identity",
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=OPERATORS_REPOSITORY,
)
worker_configs = []
test_log_dir = log_dir / request.node.name
test_log_dir.mkdir(parents=True, exist_ok=True)
for i in range(number_workers):
# Set the logging directory
worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
worker_configs.append(
WorkerConfig(
name=operator_name,
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=(
[],
{"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
),
log_level=TRITON_LOG_LEVEL,
log_dir=str(worker_log_dir),
operators=[operator_config],
)
)
worker_deployment = Deployment(worker_configs)
worker_deployment.start()
yield worker_deployment
worker_deployment.shutdown()
def _create_inputs(number, tensor_size_in_kb):
inputs = []
outputs = []
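    # NOTE: numpy.random.randint defaults to int64 (8 bytes) on Linux, so the
    # actual tensor is roughly twice the labeled size; the element count below
    # assumes 4-byte elements.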
elem_cnt = int(tensor_size_in_kb * 1024 / 4)
for _ in range(number):
input_ = numpy.random.randint(low=1, high=100, size=[elem_cnt])
expected_ = {}
expected_["output"] = input_
inputs.append(input_)
outputs.append(expected_)
return inputs, outputs
def run(
aio_benchmark,
store_inputs_in_request,
store_outputs_in_response,
tensor_size_in_kb,
data_plane_tracker,
):
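    # A single data plane is shared across all benchmark runs: opened on the
    # first run and closed on the last (see the test_identity docstring below).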
if data_plane_tracker.is_first_run:
ucp.reset()
data_plane_tracker._data_plane = UcpDataPlane()
data_plane_tracker._data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
asyncio.get_event_loop().run_until_complete(request_plane.connect())
identity_operator = RemoteOperator(
"identity", request_plane, data_plane_tracker._data_plane
)
inputs, outputs = _create_inputs(1, tensor_size_in_kb)
aio_benchmark(
post_requests,
identity_operator,
inputs,
outputs,
store_inputs_in_request,
store_outputs_in_response,
)
timeout = 5
asyncio.get_event_loop().run_until_complete(request_plane.close())
if data_plane_tracker.is_last_run:
data_plane_tracker._data_plane.close(timeout)
async def post_requests(
identity_model, inputs, outputs, store_inputs_in_request, store_outputs_in_response
):
results = []
expected_results = {}
for i, input_ in enumerate(inputs):
request_id = str(i)
request = identity_model.create_request(
inputs={"input": input_}, request_id=request_id
)
if store_inputs_in_request:
request.store_inputs_in_request.add("input")
results.append(identity_model.async_infer(request))
expected_results[request_id] = outputs[i]
for result in asyncio.as_completed(results):
responses = await result
async for response in responses:
for output_name, expected_value in expected_results[
response.request_id
].items():
output = response.outputs[output_name]
_ = numpy.from_dlpack(output.to_host())
del output
del response
@pytest.fixture(scope="module")
def data_plane_tracker():
class Tracker:
def __init__(self):
self.total_runs = 0
self.current_run = 0
self._data_plane = None
def increment_run(self):
self.current_run += 1
@property
def is_first_run(self):
return self.current_run == 1
@property
def is_last_run(self):
return self.current_run == self.total_runs
return Tracker()
# FIXME: NATS default size limit is 1 MB. However, even when tensor_size_in_kb
# is set to 600, which corresponds to 0.6144 MB, we are hitting MaxPayloadError.
# Need to investigate why the limit is being hit; a likely factor (see the
# dtype note in _create_inputs above) is that int64 elements make the actual
# tensor roughly twice the labeled size, putting 600 KB nominal at ~1.2 MB.
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present or test is configured to run with mock disaggregated_serving",
)
@pytest.mark.parametrize(
["store_inputs_in_request", "store_outputs_in_response"],
[(True, True), (False, False)],
)
@pytest.mark.parametrize(
"tensor_size_in_kb",
[10, 100, 500],
)
@pytest.mark.benchmark(min_rounds=100, max_time=1)
def test_identity(
request,
nats_server,
workers,
aio_benchmark,
store_inputs_in_request,
store_outputs_in_response,
tensor_size_in_kb,
data_plane_tracker,
):
"""
    This benchmark test checks the latency of a simple operator that returns its
    input as output without any processing.
    NOTE: The benchmark fixture cannot be used in a child process, so the data
    plane object must be opened in the same process as pytest. This means the
    pytest main process cannot create another data plane object in any other
    test; hence a run tracker is used to open and close the data plane.
"""
if data_plane_tracker.total_runs == 0:
        data_plane_tracker.total_runs = 6  # number of parameter combinations: 2 store options x 3 tensor sizes
data_plane_tracker.increment_run()
run(
aio_benchmark,
store_inputs_in_request,
store_outputs_in_response,
tensor_size_in_kb,
data_plane_tracker,
)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from triton_distributed.runtime.parser import Parser
"""
Tests for command-line argument parsing by the Parser
"""
@pytest.fixture
def default_values():
# Add default values for the command-line interface
return {
"request_plane_uri": "nats://localhost:4222",
"log_level": 0,
# TODO: Add the default options for the worker executable here
}
def test_parse_args_default(default_values):
# Tests for default values
args, parser = Parser.parse_args([])
assert args.request_plane_uri == default_values["request_plane_uri"]
assert args.log_level == default_values["log_level"]
if args.operators:
raise Exception(f"Expected no operators by default, got {args.operators}")
    if args.operator_configs:
        raise Exception(
            f"Expected no operator configs by default, got {args.operator_configs}"
        )
@pytest.mark.parametrize(
"valid_request_plane_uri",
[
"https://example.com",
# Add valid request plane uri values
],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
# Tests with valid values for request plane uri
args, _ = Parser.parse_args(["--request-plane-uri", valid_request_plane_uri])
assert args.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
return [x for x in args_list if x is not None]
@pytest.mark.parametrize(
"first_arg, second_arg, third_arg",
[
("name:abc", "version:1", "max_inflight_requests:5"),
("name:abc", "max_inflight_requests:5", None),
("name:abc", "version:1", None),
("name:abc", None, None),
# Add valid cases
],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
model_repo_path = tmp_path / "model_repo"
model_repo_path.mkdir()
d = model_repo_path / "abc"
d.mkdir()
# Tests with valid arguments
input_args = ["--operator"]
model_args = clean_argument_list(
[
first_arg,
second_arg,
third_arg,
f"repository:{model_repo_path}",
"module:worker.triton_core_operator:TritonCoreOperator",
]
)
print(model_args)
input_args = input_args + model_args
args, _ = Parser.parse_args(input_args)
assert args.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
# Tests with invalid arguments
with pytest.raises(SystemExit):
Parser.parse_args(["--operator"])
captured = capsys.readouterr()
assert "expected at least one argument" in captured.err
@pytest.mark.parametrize(
"first_arg, second_arg, third_arg",
[
("name:abc", "version:1", "max_inflight_requests:5"),
("name:abc", "max_inflight_requests:5", None),
("name:abc", "version:1", None),
        # TODO: Revisit; this case can be uncommented once the operator module can be inferred automatically.
# ("abc", None, None),
# Add valid cases
],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
repo_path = tmp_path / "worker_repo"
repo_path.mkdir()
d = repo_path / "abc"
d.mkdir()
# Tests with valid arguments
input_args = ["--operator"]
operator_args = clean_argument_list([first_arg, second_arg, third_arg])
input_args = input_args + operator_args + ["module:dummyworkflow:Workflow"]
args, _ = Parser.parse_args(input_args)
assert args.operators[0] == operator_args + ["module:dummyworkflow:Workflow"]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pytest
from triton_distributed.runtime.logger import get_logger
logger = logging.getLogger(__name__)
MSG = "This is a sample message"
"""
Tests for the logging module
"""
def logging_function(logger):
logger.info(MSG)
logger.warning(MSG)
try:
raise Exception("This is an exception")
except Exception:
logger.exception(MSG)
logger.error(MSG)
logger.debug(MSG)
@pytest.fixture
def reset_logger(caplog):
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
loggers.append(logging.getLogger())
for logger in loggers:
handlers = logger.handlers[:]
for handler in handlers:
logger.removeHandler(handler)
handler.close()
logger.setLevel(logging.NOTSET)
logger.propagate = True
caplog.clear()
@pytest.mark.parametrize(
"log_level, expected_record_counts",
[
# For log-level 0 only error and exception should be recorded
(0, 2),
# For log-level 1 only info, error, exception and warning should be recorded
(1, 4),
        # All logs (error, exception, info, debug and warning) should be recorded for log-level 2
(2, 5),
],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
caplog.set_level(log_level)
logger = get_logger(logger_name="test_logging", log_level=log_level)
logging_function(logger)
assert len(caplog.records) == expected_record_counts