Commit f1f29171 authored by Neelay Shah, committed by GitHub

feat: initial worker

parent b0195f54
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
try:
import cupy
except Exception:
cupy = None
class TritonPythonModel:
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = []
outputs = []
dims = [-1, -1]
optional = True
for data_type in ["type_int64"]:
type_name = data_type.split("_")[1].lower()
input_name = f"{type_name}_input"
output_name_1 = f"{type_name}_output_total"
output_name_2 = f"{type_name}_output_partial"
inputs.append(
{
"name": input_name,
"data_type": data_type,
"dims": dims,
"optional": optional,
}
)
outputs.append(
{"name": output_name_1, "data_type": data_type, "dims": dims}
)
outputs.append(
{"name": output_name_2, "data_type": data_type, "dims": dims}
)
outputs.append(
{"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
)
for input_ in inputs:
auto_complete_model_config.add_input(input_)
for output in outputs:
auto_complete_model_config.add_output(output)
auto_complete_model_config.set_max_batch_size(0)
return auto_complete_model_config
def initialize(self, args):
self._model_config = json.loads(args["model_config"])
self._request_gpu_memory = False
if "parameters" in self._model_config:
parameters = self._model_config["parameters"]
if (
"request_gpu_memory" in parameters
and parameters["request_gpu_memory"]["string_value"] == "True"
):
self._request_gpu_memory = True
def execute(self, requests):
responses = []
for request in requests:
output_tensors = []
for input_tensor in request.inputs():
input_value = input_tensor.as_numpy()
output_value_partial = np.array([[x.sum() for x in input_value]])
output_value_total = np.array([[input_value.sum()]])
if self._request_gpu_memory:
output_value_partial = cupy.array(output_value_partial)
output_value_total = cupy.array(output_value_total)
output_tensor = pb_utils.Tensor.from_dlpack(
input_tensor.name().replace("input", "output_partial"),
output_value_partial,
)
output_tensors.append(output_tensor)
output_tensor = pb_utils.Tensor.from_dlpack(
input_tensor.name().replace("input", "output_total"),
output_value_total,
)
output_tensors.append(output_tensor)
else:
output_tensor = pb_utils.Tensor(
input_tensor.name().replace("input", "output_partial"),
output_value_partial,
)
output_tensors.append(output_tensor)
output_tensor = pb_utils.Tensor(
input_tensor.name().replace("input", "output_total"),
output_value_total,
)
output_tensors.append(output_tensor)
output_parameters = np.array([request.parameters()]).astype(np.object_)
output_tensors.append(
pb_utils.Tensor("output_parameters", output_parameters)
)
responses.append(
pb_utils.InferenceResponse(
output_tensors=output_tensors,
)
)
return responses
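For reference, a minimal standalone sketch (plain NumPy, outside Triton) of the two reductions this worker computes for a [2, N] int64 input:

import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
partial = np.array([[row.sum() for row in x]])  # per-row sums -> [[6, 15]]
total = np.array([[x.sum()]])                   # grand total  -> [[21]]
assert partial.tolist() == [[6, 15]] and total.tolist() == [[21]]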
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args):
model_config = json.loads(args["model_config"])
self._context_delay = (
int(model_config["parameters"]["context_delay_ms"]["string_value"]) / 1000
)
for output_name in [
"KV_CACHE",
"OUTPUT_IDS",
"SEQUENCE_LENGTH",
"REQUEST_OUTPUT_LEN",
]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
responses = []
for request in requests:
# Get input tensors
input_ids = pb_utils.get_input_tensor_by_name(
request, "INPUT_IDS"
).as_numpy()
input_lengths = pb_utils.get_input_tensor_by_name(
request, "INPUT_LENGTH"
).as_numpy()
request_output_len = pb_utils.get_input_tensor_by_name(
request, "REQUEST_OUTPUT_LEN"
).as_numpy()
time.sleep(self._context_delay)
# Create output tensors. You need pb_utils.Tensor
# objects to create pb_utils.InferenceResponse.
kv_cache_tensor = pb_utils.Tensor(
"KV_CACHE", input_ids.astype(self.kv_cache_dtype)
)
output_ids_tensor = pb_utils.Tensor(
"OUTPUT_IDS", input_ids.astype(self.output_ids_dtype)
)
sequence_length_tensor = pb_utils.Tensor(
"SEQUENCE_LENGTH", input_lengths.astype(self.sequence_length_dtype)
)
request_output_len_tensor = pb_utils.Tensor(
"REQUEST_OUTPUT_LEN", request_output_len
)
inference_response = pb_utils.InferenceResponse(
output_tensors=[
kv_cache_tensor,
output_ids_tensor,
sequence_length_tensor,
request_output_len_tensor,
]
)
responses.append(inference_response)
return responses
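Stripped of the Triton plumbing, the per-request behavior above is a configurable delay followed by echoing the prompt ids; a toy sketch with illustrative values:

import time

import numpy as np

input_ids = np.array([101, 7592, 102], dtype=np.int32)
time.sleep(1.0)                # context_delay_ms ("1000") / 1000
kv_cache = input_ids.copy()    # KV_CACHE is mocked as the prompt ids
output_ids = input_ids.copy()  # OUTPUT_IDS echoes the prompt ids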
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "context"
backend: "python"
max_batch_size: 0
parameters: {
key: "context_delay_ms"
value: {
string_value: "1000"
}
}
input [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as needed. For simplicity, only these
# inputs are used for the context stage.
]
output [
# Outputs for the first part of the request, which returns the first
# token. These are handed over directly to the post-processor.
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
},
# Outputs for the second part of the handover to the generate stage.
{
# TODO: revisit how kv cache is being exposed to generate worker.
name: "KV_CACHE"
data_type: TYPE_INT32
dims: [ -1 ]
}
# Add more outputs as needed. For simplicity, only these
# outputs are used for the context stage.
]
# Add more parameters as per requirement
instance_group [
{
count: 1
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
try:
import cupy
except Exception:
cupy = None
class TritonPythonModel:
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = []
outputs = []
dims = [-1, -1]
optional = True
for data_type in ["type_int64"]:
type_name = data_type.split("_")[1].lower()
input_name = f"{type_name}_input"
output_name = "fp64_output_partial"
inputs.append(
{
"name": input_name,
"data_type": data_type,
"dims": dims,
"optional": optional,
}
)
# Dividing int64 inputs yields float64 values, so the output is typed fp64.
outputs.append({"name": output_name, "data_type": "type_fp64", "dims": dims})
input_name = f"{type_name}_input_divisor"
inputs.append(
{
"name": input_name,
"data_type": data_type,
"dims": dims,
"optional": optional,
}
)
outputs.append(
{"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
)
for input_ in inputs:
auto_complete_model_config.add_input(input_)
for output in outputs:
auto_complete_model_config.add_output(output)
auto_complete_model_config.set_max_batch_size(0)
return auto_complete_model_config
def initialize(self, args):
self._model_config = json.loads(args["model_config"])
self._request_gpu_memory = False
if "parameters" in self._model_config:
parameters = self._model_config["parameters"]
if (
"request_gpu_memory" in parameters
and parameters["request_gpu_memory"]["string_value"] == "True"
):
self._request_gpu_memory = True
def execute(self, requests):
responses = []
for request in requests:
output_tensors = []
divisor = pb_utils.get_input_tensor_by_name(request, "int64_input_divisor")
divisor = divisor.as_numpy()[0][0]
dividends = pb_utils.get_input_tensor_by_name(request, "int64_input")
dividends = dividends.as_numpy()
output_value = np.array([np.divide(dividends, divisor)])
if self._request_gpu_memory:
output_value = cupy.array(output_value)
output_tensor = pb_utils.Tensor.from_dlpack(
"fp64_output_partial", output_value
)
else:
output_tensor = pb_utils.Tensor("fp64_output_partial", output_value)
output_tensors.append(output_tensor)
output_parameters = np.array([request.parameters()]).astype(np.object_)
output_tensors.append(
pb_utils.Tensor("output_parameters", output_parameters)
)
responses.append(
pb_utils.InferenceResponse(
output_tensors=output_tensors,
)
)
return responses
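Note that np.divide on two int64 values yields float64, which is why the partial output above is named (and typed) fp64; a one-line check:

import numpy as np

assert np.divide(np.int64(7), np.int64(2)).dtype == np.float64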
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import threading
import time
import numpy as np
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args):
model_config = json.loads(args["model_config"])
self._output_token_latency = (
int(model_config["parameters"]["inter_token_latency_ms"]["string_value"]) / 1000
)
# model_config was already parsed from the JSON string above;
# keep a reference on the instance for later use.
self.model_config = model_config
using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
model_config
)
if not using_decoupled:
raise pb_utils.TritonModelException(
"""the model `{}` can generate any number of responses per request;
enable the decoupled transaction policy in the model configuration
to serve this model""".format(
args["model_name"]
)
)
for output_name in ["OUTPUT_IDS", "SEQUENCE_LENGTH"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
# Track response threads so that finalizing the model can be
# delayed until all response threads have completed.
self.inflight_thread_count = 0
self.inflight_thread_count_lck = threading.Lock()
def response_thread(self, response_sender, kv_cache, request_output_len):
for _ in range(request_output_len):
time.sleep(self._output_token_latency)
output_ids_tensor = pb_utils.Tensor(
"OUTPUT_IDS", kv_cache.astype(self.output_ids_dtype)
)
sequence_length = np.array([kv_cache.size])
sequence_length_tensor = pb_utils.Tensor(
"SEQUENCE_LENGTH", sequence_length.astype(self.sequence_length_dtype)
)
response = pb_utils.InferenceResponse(
output_tensors=[output_ids_tensor, sequence_length_tensor]
)
response_sender.send(response)
# We must close the response sender to indicate to Triton that we are
# done sending responses for the corresponding request. We can't use the
# response sender after closing it. The response sender is closed by
# sending the TRITONSERVER_RESPONSE_COMPLETE_FINAL flag.
response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
with self.inflight_thread_count_lck:
self.inflight_thread_count -= 1
def execute(self, requests):
for request in requests:
# Get input tensors
kv_cache = pb_utils.get_input_tensor_by_name(request, "KV_CACHE").as_numpy()
request_output_len = pb_utils.get_input_tensor_by_name(
request, "REQUEST_OUTPUT_LEN"
).as_numpy()
# Start a separate thread to send the responses for the request;
# sending responses is delegated entirely to that thread.
thread = threading.Thread(
target=self.response_thread,
args=(
request.get_response_sender(),
kv_cache,
request_output_len[0],
),
)
# A model using the decoupled transaction policy is not required to send
# all responses for the current request before returning from execute.
# To demonstrate the flexibility of the decoupled API, the response
# thread runs entirely independently of the execute thread.
thread.daemon = True
with self.inflight_thread_count_lck:
self.inflight_thread_count += 1
thread.start()
return None
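The inflight-thread counter above implies a finalize hook that blocks model unload until all response threads have drained; a minimal sketch of such a method (an assumption, not part of this commit) that could be added to the class:

def finalize(self):
    # Block until every response thread has decremented the counter.
    while True:
        with self.inflight_thread_count_lck:
            if self.inflight_thread_count == 0:
                return
        time.sleep(0.1)  # uses the module-level `time` import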
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "generation"
backend: "python"
max_batch_size: 0
model_transaction_policy {
decoupled: true
}
parameters: {
key: "inter_token_latency_ms"
value: {
string_value: "1000"
}
}
input [
{
# TODO: revisit how kv cache is being exposed to generate worker.
name: "KV_CACHE"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as needed. For simplicity, only these
# inputs are used for the generation stage.
]
output [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as needed. For simplicity, only these
# outputs are used for the generation stage.
]
# Add more parameters as per requirement
instance_group [
{
count: 1
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
try:
import cupy
except Exception:
cupy = None
class TritonPythonModel:
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = []
outputs = []
dims = [-1, -1]
optional = True
config = auto_complete_model_config.as_dict()
for data_type in pb_utils.TRITON_STRING_TO_NUMPY.keys():
type_name = data_type.split("_")[1].lower()
input_name = f"{type_name}_input"
output_name = f"{type_name}_output"
inputs.append(
{
"name": input_name,
"data_type": data_type,
"dims": dims,
"optional": optional,
}
)
outputs.append({"name": output_name, "data_type": data_type, "dims": dims})
outputs.append(
{"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
)
for input_ in inputs:
auto_complete_model_config.add_input(input_)
for output in outputs:
auto_complete_model_config.add_output(output)
auto_complete_model_config.set_max_batch_size(0)
if config.get("parameters", {}).get("decoupled", {}).get("string_value") == "True":
auto_complete_model_config.set_model_transaction_policy(
{"decoupled": True}
)
return auto_complete_model_config
def initialize(self, args):
self._model_config = json.loads(args["model_config"])
self._decoupled = self._model_config.get("model_transaction_policy", {}).get(
"decoupled"
)
self._request_gpu_memory = False
if "parameters" in self._model_config:
parameters = self._model_config["parameters"]
if (
"request_gpu_memory" in parameters
and parameters["request_gpu_memory"]["string_value"] == "True"
):
self._request_gpu_memory = True
def execute_decoupled(self, requests):
for request in requests:
sender = request.get_response_sender()
output_tensors = []
for input_tensor in request.inputs():
input_value = input_tensor.as_numpy()
output_tensor = pb_utils.Tensor(
input_tensor.name().replace("input", "output"), input_value
)
output_tensors.append(output_tensor)
sender.send(pb_utils.InferenceResponse(output_tensors=output_tensors))
sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
return None
def execute(self, requests):
if self._decoupled:
return self.execute_decoupled(requests)
responses = []
for request in requests:
output_tensors = []
for input_tensor in request.inputs():
input_value = input_tensor.as_numpy()
if self._request_gpu_memory:
input_value = cupy.array(input_value)
output_tensor = pb_utils.Tensor.from_dlpack(
input_tensor.name().replace("input", "output"), input_value
)
else:
output_tensor = pb_utils.Tensor(
input_tensor.name().replace("input", "output"), input_value
)
output_tensors.append(output_tensor)
output_parameters = np.array([request.parameters()]).astype(np.object_)
output_tensors.append(
pb_utils.Tensor("output_parameters", output_parameters)
)
responses.append(
pb_utils.InferenceResponse(
output_tensors=output_tensors,
)
)
return responses
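Since auto_complete_config above reads the decoupled flag from a "decoupled" model parameter, a config.pbtxt along these lines (a sketch; the actual file in this commit, below, only sets the backend) would switch the worker into decoupled mode:

backend: "python"
parameters: {
  key: "decoupled"
  value: {
    string_value: "True"
  }
}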
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
try:
import cupy
except Exception:
cupy = None
class TritonPythonModel:
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = []
outputs = []
dims = [-1, -1]
optional = True
for data_type in ["type_int64"]:
type_name = data_type.split("_")[1].lower()
input_name = f"{type_name}_input"
output_name = f"{type_name}_output_total"
inputs.append(
{
"name": input_name,
"data_type": data_type,
"dims": dims,
"optional": optional,
}
)
outputs.append({"name": output_name, "data_type": data_type, "dims": dims})
outputs.append(
{"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
)
for input_ in inputs:
auto_complete_model_config.add_input(input_)
for output in outputs:
auto_complete_model_config.add_output(output)
auto_complete_model_config.set_max_batch_size(0)
return auto_complete_model_config
def initialize(self, args):
self._model_config = json.loads(args["model_config"])
self._request_gpu_memory = False
if "parameters" in self._model_config:
parameters = self._model_config["parameters"]
if (
"request_gpu_memory" in parameters
and parameters["request_gpu_memory"]["string_value"] == "True"
):
self._request_gpu_memory = True
def execute(self, requests):
responses = []
for request in requests:
output_tensors = []
for input_tensor in request.inputs():
input_value = input_tensor.as_numpy()
output_value = np.array([[x.prod() for x in input_value]])
if self._request_gpu_memory:
output_value = cupy.array(output_value)
output_tensor = pb_utils.Tensor.from_dlpack(
input_tensor.name().replace("input", "output_total"),
output_value,
)
else:
output_tensor = pb_utils.Tensor(
input_tensor.name().replace("input", "output_total"),
output_value,
)
output_tensors.append(output_tensor)
output_parameters = np.array([request.parameters()]).astype(np.object_)
output_tensors.append(
pb_utils.Tensor("output_parameters", output_parameters)
)
responses.append(
pb_utils.InferenceResponse(
output_tensors=output_tensors,
)
)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from transformers import XLNetTokenizer
class TritonPythonModel:
def initialize(self, args):
model_config = json.loads(args["model_config"])
for output_name in ["OUTPUT"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
# Use a mock, hard-coded tokenizer.
self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
def execute(self, requests):
responses = []
for request in requests:
# Get input tensors
output_ids = pb_utils.get_input_tensor_by_name(
request, "OUTPUT_IDS"
).as_numpy()
output_result = np.array(
self.tokenizer.convert_ids_to_tokens(output_ids.tolist())
)
print(f"Output Result \n\n {output_result}", flush=True)
output_tensor = pb_utils.Tensor(
"OUTPUT", output_result.astype(self.output_dtype)
)
inference_response = pb_utils.InferenceResponse(
output_tensors=[output_tensor]
)
responses.append(inference_response)
return responses
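A minimal transformers sketch of the id-to-token conversion used above (it downloads the xlnet-base-cased vocabulary on first use):

from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
ids = tokenizer.encode("machine learning is fun")
tokens = tokenizer.convert_ids_to_tokens(ids)
print(ids, tokens)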
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the post-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as needed. For simplicity, only these
# inputs are used for postprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
# Add more outputs as needed. For simplicity, only these
# outputs are used for postprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from transformers import XLNetTokenizer
class TritonPythonModel:
"""
This is a mock disaggregated serving pre-processing model.
"""
def initialize(self, args):
model_config = json.loads(args["model_config"])
for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
# Use a mock, hard-coded tokenizer.
self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
def execute(self, requests):
print("In preprocessing execute!", flush=True)
responses = []
for request in requests:
# Get input tensors
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
request_output_len = pb_utils.get_input_tensor_by_name(
request, "request_output_len"
).as_numpy()
print(f"query(pre-proc) {query}", flush=True)
tokenize = np.array(self.tokenizer.encode(query[0].decode()))
print(f"tokenize(pre-proc) {tokenize.size}", flush=True)
input_length = np.array([tokenize.size])
# Hand the tokenized query to the context stage as INPUT_IDS.
input_id_tensor = pb_utils.Tensor(
"INPUT_IDS", tokenize.astype(self.input_ids_dtype)
)
# Report the tokenized sequence length as INPUT_LENGTH.
input_length_tensor = pb_utils.Tensor(
"INPUT_LENGTH", input_length.astype(self.input_length_dtype)
)
request_output_len_tensor = pb_utils.Tensor(
"REQUEST_OUTPUT_LEN", request_output_len
)
inference_response = pb_utils.InferenceResponse(
output_tensors=[
input_id_tensor,
input_length_tensor,
request_output_len_tensor,
]
)
responses.append(inference_response)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as needed. For simplicity, only these
# inputs are used for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as needed. For simplicity, only these
# outputs are used for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import sys
from multiprocessing import Manager, Process
import cupy
import numpy
import pytest
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.worker.log_formatter import LOGGER_NAME
from triton_distributed.worker.operator import OperatorConfig
from triton_distributed.worker.remote_operator import RemoteOperator
from triton_distributed.worker.triton_core_operator import TritonCoreOperator
from triton_distributed.worker.worker import WorkerConfig
NATS_PORT = 4223
MODEL_REPOSITORY = "/workspace/worker/tests/python/integration/operators/models"
WORKFLOW_REPOSITORY = "/workspace/worker/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6
logger = logging.getLogger(LOGGER_NAME)
# Probe cupy.cuda.is_available() once up front so that any CUDA
# initialization error surfaces here rather than inside test code.
try:
if not cupy.cuda.is_available():
print("CUDA not available.")
except CUDARuntimeError:
print("CUDA not available.")
# TODO: decide whether this test should run pre-merge, nightly, or weekly.
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(worker_manager, request):
worker_config = WorkerConfig(
request_plane=NatsRequestPlane,
data_plane=UcpDataPlane,
request_plane_args=([], {"request_plane_uri": f"nats://localhost:{NATS_PORT}"}),
log_level=TRITON_LOG_LEVEL,
)
store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
add_model = OperatorConfig(
name="add",
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=MODEL_REPOSITORY,
)
multiply_model = OperatorConfig(
name="multiply",
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=MODEL_REPOSITORY,
)
divide_model = OperatorConfig(
name="divide",
implementation=TritonCoreOperator,
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=MODEL_REPOSITORY,
)
workflow = OperatorConfig(
name="add_multiply_divide",
implementation="add_multiply_divide:AddMultiplyDivide",
version=1,
max_inflight_requests=10,
parameters={"store_outputs_in_response": store_outputs_in_response},
repository=WORKFLOW_REPOSITORY,
)
with Manager() as manager:
workers = []
queues = []
queues.append(manager.Queue(maxsize=1))
workers.append(
worker_manager.setup_worker_process(
[add_model], "add", queues[-1], worker_config
)
)
queues.append(manager.Queue(maxsize=1))
workers.append(
worker_manager.setup_worker_process(
[multiply_model], "multiply", queues[-1], worker_config
)
)
queues.append(manager.Queue(maxsize=1))
workers.append(
worker_manager.setup_worker_process(
[divide_model], "divide", queues[-1], worker_config
)
)
queues.append(manager.Queue(maxsize=1))
workers.append(
worker_manager.setup_worker_process(
[workflow], "add_multiply_divide", queues[-1], worker_config
)
)
workers_failed = False
status_list = []
for queue, worker in zip(queues, workers):
status = queue.get()
status_list.append(status)
if status != "READY":
workers_failed = True
if workers_failed:
worker_manager.cleanup_workers(workers, check_status=False)
raise Exception(f"Failed to start worker processes: {status_list}")
yield workers
worker_manager.cleanup_workers(workers)
def _create_inputs(number, size):
inputs = []
outputs = []
for index in range(number):
input_ = numpy.random.randint(low=1, high=100, size=[2, size])
expected_ = {}
expected_["add_int64_output_total"] = numpy.array([[input_.sum()]])
expected_["add_int64_output_partial"] = numpy.array([[x.sum() for x in input_]])
expected_["multiply_int64_output_total"] = numpy.array(
[[x.prod() for x in expected_["add_int64_output_partial"]]]
)
divisor = expected_["add_int64_output_total"][0][0]
dividends = expected_["add_int64_output_partial"]
expected_["divide_fp64_output_partial"] = numpy.array(
[numpy.divide(dividends, divisor)]
)
inputs.append(input_)
outputs.append(expected_)
return inputs, outputs
async def post_requests(num_requests, store_inputs_in_request):
ucp.reset()
data_plane = UcpDataPlane()
data_plane.connect()
request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
await request_plane.connect()
add_multiply_divide_model = RemoteOperator(
"add_multiply_divide", 1, request_plane, data_plane
)
results = []
expected_results = {}
inputs, outputs = _create_inputs(num_requests, 40)
for i, input_ in enumerate(inputs):
request_id = str(i)
request = add_multiply_divide_model.create_request(
inputs={"int64_input": input_}, request_id=request_id
)
if store_inputs_in_request:
request.store_inputs_in_request.add("int64_input")
print(request)
results.append(add_multiply_divide_model.async_infer(request))
expected_results[request_id] = outputs[i]
for result in asyncio.as_completed(results):
responses = await result
async for response in responses:
print(response)
for output_name, expected_value in expected_results[
response.request_id
].items():
output = response.outputs[output_name]
output_value = numpy.from_dlpack(output.to_host())
numpy.testing.assert_equal(output_value, expected_value)
del output
print(expected_results[response.request_id])
del response
timeout = 5
data_plane.close(timeout)
await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
sys.exit(
asyncio.run(
post_requests(
num_requests=num_requests,
store_inputs_in_request=store_inputs_in_request,
)
)
)
@pytest.mark.skipif(
"(not os.path.exists('/usr/local/bin/nats-server'))",
reason="NATS.io not present",
)
@pytest.mark.timeout(30)
@pytest.mark.parametrize(
["store_inputs_in_request", "store_outputs_in_response"],
[(False, False), (True, True)],
)
def test_add_multiply_divide(
request, nats_server, workers, store_inputs_in_request, store_outputs_in_response
):
# Run the client in a separate process so the data plane can be set up
# and torn down cleanly for each test.
p = Process(target=run, args=(2, store_inputs_in_request))
p.start()
p.join()
assert p.exitcode == 0
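For a single [2, 2] input, the expected_results entry built by _create_inputs above works out to concrete values like these (a worked example, not part of the test):

import numpy

input_ = numpy.array([[1, 2], [3, 4]])
expected = {
    "add_int64_output_partial": numpy.array([[3, 7]]),          # row sums
    "add_int64_output_total": numpy.array([[10]]),              # grand total
    "multiply_int64_output_total": numpy.array([[21]]),         # 3 * 7
    "divide_fp64_output_partial": numpy.array([[[0.3, 0.7]]]),  # [3, 7] / 10
}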
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from triton_distributed.worker.parser import Parser
"""
Tests for parsing command-line arguments with the worker's Parser.
"""
@pytest.fixture
def default_values():
# Default values for the command-line interface.
return {
"request_plane_uri": "nats://localhost:4222",
"log_level": 0,
# TODO: Add the default options for the worker executable here
}
def test_parse_args_default(default_values):
# Tests for default values
args, parser = Parser.parse_args([])
assert args.request_plane_uri == default_values["request_plane_uri"]
assert args.log_level == default_values["log_level"]
assert not args.operators, f"Expected no operators by default, got {args.operators}"
assert not args.operator_configs, (
f"Expected no operator configs by default, got {args.operator_configs}"
)
@pytest.mark.parametrize(
"valid_request_plane_uri",
[
"https://example.com",
# Add valid request plane uri values
],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
# Tests with valid values for request plane uri
args, _ = Parser.parse_args(["--request-plane-uri", valid_request_plane_uri])
assert args.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
return [x for x in args_list if x is not None]
@pytest.mark.parametrize(
"first_arg, second_arg, third_arg",
[
("name:abc", "version:1", "max_inflight_requests:5"),
("name:abc", "max_inflight_requests:5", None),
("name:abc", "version:1", None),
("name:abc", None, None),
# Add valid cases
],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
model_repo_path = tmp_path / "model_repo"
model_repo_path.mkdir()
d = model_repo_path / "abc"
d.mkdir()
# Tests with valid arguments
input_args = ["--operator"]
model_args = clean_argument_list(
[
first_arg,
second_arg,
third_arg,
f"repository:{model_repo_path}",
"module:worker.triton_core_operator:TritonCoreOperator",
]
)
print(model_args)
input_args = input_args + model_args
args, _ = Parser.parse_args(input_args)
assert args.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
# Tests with invalid arguments
with pytest.raises(SystemExit):
Parser.parse_args(["--operator"])
captured = capsys.readouterr()
assert "expected at least one argument" in captured.err
@pytest.mark.parametrize(
"first_arg, second_arg, third_arg",
[
("name:abc", "version:1", "max_inflight_requests:5"),
("name:abc", "max_inflight_requests:5", None),
("name:abc", "version:1", None),
# TODO: revisit; this case can be uncommented once the operator module can be inferred automatically.
# ("abc", None, None),
# Add valid cases
],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
repo_path = tmp_path / "worker_repo"
repo_path.mkdir()
d = repo_path / "abc"
d.mkdir()
# Tests with valid arguments
input_args = ["--operator"]
operator_args = clean_argument_list([first_arg, second_arg, third_arg])
input_args = input_args + operator_args + ["module:dummyworkflow:Workflow"]
args, _ = Parser.parse_args(input_args)
assert args.operators[0] == operator_args + ["module:dummyworkflow:Workflow"]
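Put together, the flags exercised above can be handed straight to Parser.parse_args; a hedged sketch (the repository path is illustrative):

from triton_distributed.worker.parser import Parser

args, parser = Parser.parse_args(
    [
        "--request-plane-uri", "nats://localhost:4222",
        "--operator",
        "name:abc", "version:1", "max_inflight_requests:5",
        "repository:/path/to/model_repo",
        "module:worker.triton_core_operator:TritonCoreOperator",
    ]
)
print(args.request_plane_uri, args.operators)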
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pytest
from triton_distributed.worker.log_formatter import LOGGER_NAME, setup_logger
logger = logging.getLogger(LOGGER_NAME)
MSG = "This is a sample message"
"""
Tests for the logging module.
"""
def logging_function(logger):
logger.info(MSG)
logger.warning(MSG)
try:
raise Exception("This is an exception")
except Exception:
logger.exception(MSG)
logger.error(MSG)
logger.debug(MSG)
@pytest.fixture
def reset_logger(caplog):
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
loggers.append(logging.getLogger())
for logger in loggers:
handlers = logger.handlers[:]
for handler in handlers:
logger.removeHandler(handler)
handler.close()
logger.setLevel(logging.NOTSET)
logger.propagate = True
caplog.clear()
@pytest.mark.parametrize(
"log_level, expected_record_counts",
[
# For log level 0, only error and exception records are captured.
(0, 2),
# For log level 1, warning and info records are captured as well.
(1, 4),
# For log level 2, all records (error, exception, warning, info, debug) are captured.
(2, 5),
],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
caplog.set_level(log_level)
setup_logger(log_level=log_level)
logging_function(logger)
assert len(caplog.records) == expected_record_counts
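A minimal usage sketch of the logger under test, matching the level semantics asserted above:

import logging

from triton_distributed.worker.log_formatter import LOGGER_NAME, setup_logger

setup_logger(log_level=2)  # 0: error/exception; 1: adds warning/info; 2: adds debug
log = logging.getLogger(LOGGER_NAME)
log.debug("visible at log level 2")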