Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f1f29171
Commit
f1f29171
authored
Jan 14, 2025
by
Neelay Shah
Committed by
GitHub
Jan 14, 2025
Browse files
feat: initial worker
parent
b0195f54
Changes
39
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1637 additions
and
0 deletions
+1637
-0
worker/tests/python/integration/operators/models/add/1/model.py
.../tests/python/integration/operators/models/add/1/model.py
+128
-0
worker/tests/python/integration/operators/models/add/config.pbtxt
...ests/python/integration/operators/models/add/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/context/1/model.py
...ts/python/integration/operators/models/context/1/model.py
+87
-0
worker/tests/python/integration/operators/models/context/config.pbtxt
.../python/integration/operators/models/context/config.pbtxt
+90
-0
worker/tests/python/integration/operators/models/divide/1/model.py
...sts/python/integration/operators/models/divide/1/model.py
+119
-0
worker/tests/python/integration/operators/models/divide/config.pbtxt
...s/python/integration/operators/models/divide/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/generation/1/model.py
...python/integration/operators/models/generation/1/model.py
+119
-0
worker/tests/python/integration/operators/models/generation/config.pbtxt
...thon/integration/operators/models/generation/config.pbtxt
+75
-0
worker/tests/python/integration/operators/models/identity/1/model.py
...s/python/integration/operators/models/identity/1/model.py
+126
-0
worker/tests/python/integration/operators/models/identity/config.pbtxt
...python/integration/operators/models/identity/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/multiply/1/model.py
...s/python/integration/operators/models/multiply/1/model.py
+104
-0
worker/tests/python/integration/operators/models/multiply/config.pbtxt
...python/integration/operators/models/multiply/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/postprocessing/1/model.py
...on/integration/operators/models/postprocessing/1/model.py
+65
-0
worker/tests/python/integration/operators/models/postprocessing/config.pbtxt
.../integration/operators/models/postprocessing/config.pbtxt
+55
-0
worker/tests/python/integration/operators/models/preprocessing/1/model.py
...hon/integration/operators/models/preprocessing/1/model.py
+85
-0
worker/tests/python/integration/operators/models/preprocessing/config.pbtxt
...n/integration/operators/models/preprocessing/config.pbtxt
+68
-0
worker/tests/python/integration/test_add_multiply_divide.py
worker/tests/python/integration/test_add_multiply_divide.py
+255
-0
worker/tests/python/unit/test_args.py
worker/tests/python/unit/test_args.py
+126
-0
worker/tests/python/unit/test_logger.py
worker/tests/python/unit/test_logger.py
+71
-0
No files found.
worker/tests/python/integration/operators/models/add/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that sums its input tensors.

    For the int64 input ``int64_input`` the model emits
    ``int64_output_partial`` (the sum of each sub-array along the first axis)
    and ``int64_output_total`` (the sum of every element), and echoes the
    request parameters as a JSON string in ``output_parameters``.  When the
    ``request_gpu_memory`` parameter is "True", outputs are moved to GPU
    memory via cupy and exposed through DLPack.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare the optional int64 input and its sum outputs on the config."""
        dims = [-1, -1]
        pending_inputs = []
        pending_outputs = []
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            pending_outputs.append(
                {"name": f"{prefix}_output_total", "data_type": data_type, "dims": dims}
            )
            pending_outputs.append(
                {"name": f"{prefix}_output_partial", "data_type": data_type, "dims": dims}
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and read the request_gpu_memory flag."""
        self._model_config = json.loads(args["model_config"])
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute(self, requests):
        """Return one response per request with partial/total sums per input."""
        responses = []
        for request in requests:
            collected = []
            for input_tensor in request.inputs():
                values = input_tensor.as_numpy()
                partial = np.array([[chunk.sum() for chunk in values]])
                total = np.array([[values.sum()]])
                # Partial sums first, then the grand total (matches the
                # original output ordering).
                for suffix, payload in (("output_partial", partial), ("output_total", total)):
                    out_name = input_tensor.name().replace("input", suffix)
                    if self._request_gpu_memory:
                        # Move to GPU and hand the buffer over via DLPack.
                        collected.append(
                            pb_utils.Tensor.from_dlpack(out_name, cupy.array(payload))
                        )
                    else:
                        collected.append(pb_utils.Tensor(out_name, payload))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            collected.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=collected))
        return responses
worker/tests/python/integration/operators/models/add/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/context/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
time
import
triton_python_backend_utils
as
pb_utils
class TritonPythonModel:
    """Emulates the context (prefill) stage of an LLM-style pipeline.

    Sleeps for ``context_delay_ms`` per request, then echoes the input ids as
    both KV_CACHE and OUTPUT_IDS, the input length as SEQUENCE_LENGTH, and
    passes REQUEST_OUTPUT_LEN through unchanged.
    """

    def initialize(self, args):
        """Read the delay parameter and cache output dtypes from the config."""
        cfg = json.loads(args["model_config"])
        # context_delay_ms is a string parameter; convert to seconds.
        delay_ms = int(cfg["parameters"]["context_delay_ms"]["string_value"])
        self._context_delay = delay_ms / 1000
        # Cache the numpy dtype of each output as e.g. self.kv_cache_dtype.
        for name in ["KV_CACHE", "OUTPUT_IDS", "SEQUENCE_LENGTH", "REQUEST_OUTPUT_LEN"]:
            dtype = pb_utils.triton_string_to_numpy(
                pb_utils.get_output_config_by_name(cfg, name)["data_type"]
            )
            setattr(self, name.lower() + "_dtype", dtype)

    def execute(self, requests):
        """Return one response per request after the configured prefill delay."""
        responses = []
        for request in requests:
            # Fetch the request inputs.
            input_ids = pb_utils.get_input_tensor_by_name(request, "INPUT_IDS").as_numpy()
            input_lengths = pb_utils.get_input_tensor_by_name(request, "INPUT_LENGTH").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "REQUEST_OUTPUT_LEN"
            ).as_numpy()
            # Emulate prefill latency.
            time.sleep(self._context_delay)
            # Build pb_utils.Tensor objects for the InferenceResponse.
            produced = [
                pb_utils.Tensor("KV_CACHE", input_ids.astype(self.kv_cache_dtype)),
                pb_utils.Tensor("OUTPUT_IDS", input_ids.astype(self.output_ids_dtype)),
                pb_utils.Tensor(
                    "SEQUENCE_LENGTH", input_lengths.astype(self.sequence_length_dtype)
                ),
                pb_utils.Tensor("REQUEST_OUTPUT_LEN", request_output_len),
            ]
            responses.append(pb_utils.InferenceResponse(output_tensors=produced))
        return responses
worker/tests/python/integration/operators/models/context/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "context"
backend: "python"
max_batch_size: 0
# Artificial prefill delay, in milliseconds, applied once per request by 1/model.py.
parameters: {
  key: "context_delay_ms"
  value: {
    string_value: "1000"
  }
}
input [
  {
    name: "INPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "INPUT_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more inputs as per requirement.
  # For simplicity only sticking with these
  # inputs for preprocessing.
]
output [
  # Section of the first request that returns the first token.
  # These will be handed over directly to the post-processor
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  # Section of the second part of handover to the generate stage
  {
    # TODO: revisit how kv cache is being exposed to generate worker.
    name: "KV_CACHE"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
  # Add more outputs as per requirement.
  # For simplicity only sticking with these
  # outputs for preprocessing.
]
# Add more parameters as per requirement
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
worker/tests/python/integration/operators/models/divide/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that divides one input by a scalar divisor.

    Divides ``int64_input`` element-wise by the first element of
    ``int64_input_divisor`` and emits the quotient as ``fp64_output_partial``,
    plus the request parameters echoed in ``output_parameters``.  When the
    ``request_gpu_memory`` parameter is "True", the quotient is moved to GPU
    memory via cupy and exposed through DLPack.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare the dividend/divisor inputs and the quotient output."""
        dims = [-1, -1]
        pending_inputs = []
        pending_outputs = []
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            # np.divide produces floats, hence the fp64-named output.
            pending_outputs.append(
                {"name": "fp64_output_partial", "data_type": data_type, "dims": dims}
            )
            pending_inputs.append(
                {
                    "name": f"{prefix}_input_divisor",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and read the request_gpu_memory flag."""
        self._model_config = json.loads(args["model_config"])
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute(self, requests):
        """Return one response per request containing the quotient tensor."""
        responses = []
        for request in requests:
            produced = []
            # The divisor is the first element of the (nested) divisor input.
            divisor_tensor = pb_utils.get_input_tensor_by_name(request, "int64_input_divisor")
            scalar_divisor = divisor_tensor.as_numpy()[0][0]
            dividend_tensor = pb_utils.get_input_tensor_by_name(request, "int64_input")
            dividends = dividend_tensor.as_numpy()
            quotient = np.array([np.divide(dividends, scalar_divisor)])
            if self._request_gpu_memory:
                # Move to GPU and hand the buffer over via DLPack.
                produced.append(
                    pb_utils.Tensor.from_dlpack("fp64_output_partial", cupy.array(quotient))
                )
            else:
                produced.append(pb_utils.Tensor("fp64_output_partial", quotient))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            produced.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=produced))
        return responses
worker/tests/python/integration/operators/models/divide/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/generation/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
threading
import
time
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
class TritonPythonModel:
    """Emulates the generation (decode) stage of an LLM-style pipeline.

    For each request, a background thread streams ``REQUEST_OUTPUT_LEN``
    responses (one per emulated token), sleeping ``inter_token_latency_ms``
    between responses, over the decoupled response-sender API.
    """

    def initialize(self, args):
        """Parse the model config, require decoupled mode, cache output dtypes.

        Args:
            args: Triton-supplied dict; ``args["model_config"]`` holds the
                JSON-serialized model configuration.

        Raises:
            pb_utils.TritonModelException: if the model is not configured with
                the decoupled transaction policy.
        """
        model_config = json.loads(args["model_config"])
        # inter_token_latency_ms is a string parameter; convert to seconds.
        self._output_token_latency = (
            int(model_config["parameters"]["inter_token_latency_ms"]["string_value"]) / 1000
        )
        # Keep the parsed config around (previously parsed a second,
        # redundant time).
        self.model_config = model_config
        # This model sends a variable number of responses per request, which
        # only works under the decoupled transaction policy.
        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config)
        if not using_decoupled:
            raise pb_utils.TritonModelException(
                """the model `{}` can generate any number of responses per request,
                enable decoupled transaction policy in model configuration to
                serve this model""".format(args["model_name"])
            )
        # Cache the numpy dtype of each output as e.g. self.output_ids_dtype.
        for output_name in ["OUTPUT_IDS", "SEQUENCE_LENGTH"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)["data_type"]
                ),
            )
        # To keep track of response threads so that we can delay
        # the finalizing the model until all response threads
        # have completed.
        self.inflight_thread_count = 0
        self.inflight_thread_count_lck = threading.Lock()

    def response_thread(self, response_sender, kv_cache, request_output_len):
        """Stream request_output_len responses, then close the sender.

        Args:
            response_sender: the decoupled sender of the originating request.
            kv_cache: numpy array echoed back as OUTPUT_IDS each iteration.
            request_output_len: number of responses ("tokens") to emit.
        """
        for _ in range(request_output_len):
            time.sleep(self._output_token_latency)  # emulate per-token latency
            output_ids_tensor = pb_utils.Tensor(
                "OUTPUT_IDS", kv_cache.astype(self.output_ids_dtype)
            )
            sequence_length = np.array([kv_cache.size])
            sequence_length_tensor = pb_utils.Tensor(
                "SEQUENCE_LENGTH", sequence_length.astype(self.sequence_length_dtype)
            )
            response = pb_utils.InferenceResponse(
                output_tensors=[output_ids_tensor, sequence_length_tensor]
            )
            response_sender.send(response)
        # We must close the response sender to indicate to Triton that we are
        # done sending responses for the corresponding request. We can't use the
        # response sender after closing it. The response sender is closed by
        # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL.
        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1

    def execute(self, requests):
        """Spawn one response thread per request; returns None (decoupled)."""
        for request in requests:
            # Get input tensors
            kv_cache = pb_utils.get_input_tensor_by_name(request, "KV_CACHE").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "REQUEST_OUTPUT_LEN"
            ).as_numpy()
            # Start a separate thread to send the responses for the request.
            # BUG FIX: use *this* request's sender, not requests[0]'s — the old
            # code streamed every request's responses to the first client and
            # never closed the remaining senders, hanging those clients.
            thread = threading.Thread(
                target=self.response_thread,
                args=(
                    request.get_response_sender(),
                    kv_cache,
                    request_output_len[0],
                ),
            )
            # A model using decoupled transaction policy is not required to send all
            # responses for the current request before returning from the execute.
            # The response thread runs entirely independent of the execute thread.
            thread.daemon = True
            with self.inflight_thread_count_lck:
                self.inflight_thread_count += 1
            thread.start()
        return None
worker/tests/python/integration/operators/models/generation/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "generation"
backend: "python"
max_batch_size: 0
# The model streams a variable number of responses per request, so it must run
# decoupled (1/model.py enforces this at initialize()).
model_transaction_policy {
  decoupled: true
}
# Delay, in milliseconds, between consecutive streamed responses.
parameters: {
  key: "inter_token_latency_ms"
  value: {
    string_value: "1000"
  }
}
input [
  {
    # TODO: revisit how kv cache is being exposed to generate worker.
    name: "KV_CACHE"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more inputs as per requirement.
  # For simplicity only sticking with these
  # inputs for preprocessing.
]
output [
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more outputs as per requirement.
  # For simplicity only sticking with these
  # outputs for preprocessing.
]
# Add more parameters as per requirement
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
worker/tests/python/integration/operators/models/identity/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that echoes every input back as a matching output.

    For each configured data type, ``<type>_input`` is copied to
    ``<type>_output``; request parameters are echoed in ``output_parameters``
    (non-decoupled mode only).  Supports an optional decoupled mode and an
    optional GPU-memory mode (cupy + DLPack), both driven by config
    parameters.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare an input/output pair for every Triton data type."""
        dims = [-1, -1]
        cfg = auto_complete_model_config.as_dict()
        pending_inputs = []
        pending_outputs = []
        for data_type in pb_utils.TRITON_STRING_TO_NUMPY.keys():
            # "TYPE_FP32" -> "fp32", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            pending_outputs.append(
                {"name": f"{prefix}_output", "data_type": data_type, "dims": dims}
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        # Opt into decoupled mode when the config parameter requests it.
        if "decoupled" in cfg["parameters"]:
            if cfg["parameters"]["decoupled"]["string_value"] == "True":
                auto_complete_model_config.set_model_transaction_policy({"decoupled": True})
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config; read decoupled and GPU-memory flags."""
        self._model_config = json.loads(args["model_config"])
        txn_policy = self._model_config.get("model_transaction_policy", {})
        self._decoupled = txn_policy.get("decoupled")
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute_decoupled(self, requests):
        """Echo inputs over each request's response sender, then close it."""
        for request in requests:
            sender = request.get_response_sender()
            echoed = []
            for input_tensor in request.inputs():
                out_name = input_tensor.name().replace("input", "output")
                echoed.append(pb_utils.Tensor(out_name, input_tensor.as_numpy()))
            sender.send(pb_utils.InferenceResponse(output_tensors=echoed))
            # Closing flag: no further responses for this request.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None

    def execute(self, requests):
        """Return echoed outputs, delegating to the decoupled path if enabled."""
        if self._decoupled:
            return self.execute_decoupled(requests)
        responses = []
        for request in requests:
            echoed = []
            for input_tensor in request.inputs():
                values = input_tensor.as_numpy()
                out_name = input_tensor.name().replace("input", "output")
                if self._request_gpu_memory:
                    # Move to GPU and hand the buffer over via DLPack.
                    echoed.append(pb_utils.Tensor.from_dlpack(out_name, cupy.array(values)))
                else:
                    echoed.append(pb_utils.Tensor(out_name, values))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            echoed.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=echoed))
        return responses
worker/tests/python/integration/operators/models/identity/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/multiply/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that multiplies values along each row of its inputs.

    For every input tensor named ``<type>_input`` it produces a tensor named
    ``<type>_output_total`` holding the per-row product of the input values.
    The request parameters are echoed back in an ``output_parameters``
    string tensor.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Register inputs/outputs so no hand-written config is required.

        Declares one optional 2-D int64 input (``int64_input``), the matching
        product output (``int64_output_total``), and the string tensor used to
        echo request parameters. Batching is disabled (max_batch_size 0).
        """
        inputs = []
        outputs = []
        dims = [-1, -1]
        optional = True
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64" used as the tensor-name prefix.
            type_name = data_type.split("_")[1].lower()
            input_name = f"{type_name}_input"
            output_name = f"{type_name}_output_total"
            inputs.append(
                {
                    "name": input_name,
                    "data_type": data_type,
                    "dims": dims,
                    "optional": optional,
                }
            )
            outputs.append(
                {"name": output_name, "data_type": data_type, "dims": dims}
            )
        outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for input_ in inputs:
            auto_complete_model_config.add_input(input_)
        for output in outputs:
            auto_complete_model_config.add_output(output)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and cache the GPU-output flag."""
        self._model_config = json.loads(args["model_config"])
        self._request_gpu_memory = False
        parameters = self._model_config.get("parameters", {})
        if (
            "request_gpu_memory" in parameters
            and parameters["request_gpu_memory"]["string_value"] == "True"
        ):
            self._request_gpu_memory = True

    def execute(self, requests):
        """Compute the per-row product for every input tensor of each request.

        Returns one InferenceResponse per request. When ``request_gpu_memory``
        is enabled the outputs are moved to GPU memory via cupy and exposed
        through DLPack.

        Raises:
            RuntimeError: if GPU outputs were requested but cupy is missing.
        """
        if self._request_gpu_memory and cupy is None:
            # Fail fast with a clear message instead of an AttributeError on
            # ``None`` the first time a request arrives.
            raise RuntimeError(
                "request_gpu_memory is enabled but cupy is not available"
            )
        responses = []
        for request in requests:
            output_tensors = []
            for input_tensor in request.inputs():
                input_value = input_tensor.as_numpy()
                # Row-wise product; keeps a leading dimension of 1.
                output_value = np.array([[x.prod() for x in input_value]])
                if self._request_gpu_memory:
                    output_value = cupy.array(output_value)
                    output_tensor = pb_utils.Tensor.from_dlpack(
                        input_tensor.name().replace("input", "output_total"),
                        output_value,
                    )
                else:
                    output_tensor = pb_utils.Tensor(
                        input_tensor.name().replace("input", "output_total"),
                        output_value,
                    )
                output_tensors.append(output_tensor)
            # Echo the request parameters back as a string tensor.
            output_parameters = np.array([request.parameters()]).astype(np.object_)
            output_tensors.append(
                pb_utils.Tensor("output_parameters", output_parameters)
            )
            responses.append(
                pb_utils.InferenceResponse(output_tensors=output_tensors)
            )
        return responses
worker/tests/python/integration/operators/models/multiply/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/postprocessing/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """Mock post-processing model: decodes token ids back into token strings."""

    def initialize(self, args):
        """Cache the numpy dtype of each declared output and load a tokenizer."""
        model_config = json.loads(args["model_config"])
        for output_name in ["OUTPUT"]:
            # e.g. stores self.output_dtype for the "OUTPUT" tensor.
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    def execute(self, requests):
        """Convert each request's OUTPUT_IDS tensor into an OUTPUT token tensor."""
        responses = []
        # NOTE(review): the enumerate() index was unused and has been dropped.
        for request in requests:
            output_ids = pb_utils.get_input_tensor_by_name(
                request, "OUTPUT_IDS"
            ).as_numpy()
            output_result = np.array(
                self.tokenizer.convert_ids_to_tokens(output_ids.tolist())
            )
            print(f"Output Result\n\n{output_result}", flush=True)
            output_tensor = pb_utils.Tensor(
                "OUTPUT", output_result.astype(self.output_dtype)
            )
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output_tensor])
            )
        return responses
worker/tests/python/integration/operators/models/postprocessing/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
worker/tests/python/integration/operators/models/preprocessing/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """
    This is a mock disaggregated serving pre-processing model.
    """

    def initialize(self, args):
        """Cache the numpy dtype of each declared output and load a tokenizer."""
        model_config = json.loads(args["model_config"])
        for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
            # e.g. stores self.input_ids_dtype for the "INPUT_IDS" tensor.
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    def execute(self, requests):
        """Tokenize each request's query and forward the requested output length."""
        print("In preprocessing execute!", flush=True)
        responses = []
        # NOTE(review): the enumerate() index was unused and has been dropped.
        for request in requests:
            query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "request_output_len"
            ).as_numpy()
            print(f"query(pre-proc) {query}", flush=True)
            tokenize = np.array(self.tokenizer.encode(query[0].decode()))
            print(f"tokenize(pre-proc) {tokenize.size}", flush=True)
            input_length = np.array([tokenize.size])
            # Just forwarding query to the pre-processed input_ids
            input_id_tensor = pb_utils.Tensor(
                "INPUT_IDS", tokenize.astype(self.input_ids_dtype)
            )
            input_length_tensor = pb_utils.Tensor(
                "INPUT_LENGTH", input_length.astype(self.input_length_dtype)
            )
            request_output_len_tensor = pb_utils.Tensor(
                "REQUEST_OUTPUT_LEN", request_output_len
            )
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[
                        input_id_tensor,
                        input_length_tensor,
                        request_output_len_tensor,
                    ]
                )
            )
        return responses
worker/tests/python/integration/operators/models/preprocessing/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
worker/tests/python/integration/test_add_multiply_divide.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
sys
from
multiprocessing
import
Manager
,
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.worker.log_formatter
import
LOGGER_NAME
from
triton_distributed.worker.operator
import
OperatorConfig
from
triton_distributed.worker.remote_operator
import
RemoteOperator
from
triton_distributed.worker.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.worker.worker
import
WorkerConfig
# Test-wide constants: the NATS port the fixtures start the server on and
# the on-disk repositories the operators are loaded from.
NATS_PORT = 4223
MODEL_REPOSITORY = "/workspace/worker/tests/python/integration/operators/models"
WORKFLOW_REPOSITORY = "/workspace/worker/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6

logger = logging.getLogger(LOGGER_NAME)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
# (Rewritten from the `if ...: pass / else:` form to the direct negation.)
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(worker_manager, request):
    """Start one worker per operator (add, multiply, divide, workflow).

    Yields the worker handles once every worker has reported READY and
    cleans them up afterwards.

    Raises:
        Exception: if any worker process fails to report READY.
    """
    worker_config = WorkerConfig(
        request_plane=NatsRequestPlane,
        data_plane=UcpDataPlane,
        request_plane_args=([], {"request_plane_uri": f"nats://localhost:{NATS_PORT}"}),
        log_level=TRITON_LOG_LEVEL,
    )
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
    parameters = {"store_outputs_in_response": store_outputs_in_response}

    # The three arithmetic operators differ only by name, so build them in a
    # loop instead of repeating the same OperatorConfig three times.
    operator_configs = {
        name: OperatorConfig(
            name=name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            parameters=parameters,
            repository=MODEL_REPOSITORY,
        )
        for name in ("add", "multiply", "divide")
    }
    # The workflow operator is implemented as a Python module, not a Triton
    # core model, and lives in its own repository.
    operator_configs["add_multiply_divide"] = OperatorConfig(
        name="add_multiply_divide",
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        parameters=parameters,
        repository=WORKFLOW_REPOSITORY,
    )

    with Manager() as manager:
        workers = []
        queues = []
        for name, config in operator_configs.items():
            # Each worker reports its startup status through its own queue.
            queues.append(manager.Queue(maxsize=1))
            workers.append(
                worker_manager.setup_worker_process(
                    [config], name, queues[-1], worker_config
                )
            )

        # Block until every worker reports a startup status.
        status_list = [queue.get() for queue in queues]
        if any(status != "READY" for status in status_list):
            worker_manager.cleanup_workers(workers, check_status=False)
            raise Exception(f"Failed to start worker processes: {status_list}")
        yield workers
        worker_manager.cleanup_workers(workers)
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests, store_inputs_in_request):
    """Send `num_requests` requests to the add_multiply_divide workflow and
    verify every response against the locally computed expected outputs.

    Connects its own request/data planes so it can run in a fresh process.
    """
    ucp.reset()
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()

    add_multiply_divide_model = RemoteOperator(
        "add_multiply_divide", 1, request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_model.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        if store_inputs_in_request:
            request.store_inputs_in_request.add("int64_input")
        print(request)
        results.append(add_multiply_divide_model.async_infer(request))
        expected_results[request_id] = outputs[i]

    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release the remote buffer reference promptly.
                del output
            print(expected_results[response.request_id])
            del response

    # NOTE(review): a duplicate, unused ``timeout = 5`` assignment earlier in
    # this function was dead code and has been removed.
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
    """Entry point for the client subprocess; exits with the coroutine's status."""
    outcome = asyncio.run(
        post_requests(
            num_requests=num_requests,
            store_inputs_in_request=store_inputs_in_request,
        )
    )
    sys.exit(outcome)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(30)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(False, False), (True, True)],
)
def test_add_multiply_divide(
    request,
    nats_server,
    workers,
    store_inputs_in_request,
    store_outputs_in_response,
):
    """End-to-end check of the add/multiply/divide workflow with 2 requests."""
    # Using a separate process to use data plane across multiple tests.
    client = Process(target=run, args=(2, store_inputs_in_request))
    client.start()
    client.join()
    assert client.exitcode == 0
worker/tests/python/unit/test_args.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
from
triton_distributed.worker.parser
import
Parser
"""
Tests for parsing the arguments by command line parser
"""
@pytest.fixture
def default_values():
    """Default CLI values the parser is expected to fall back to."""
    # Add default values for the command-line interface
    # TODO: Add the default options for the worker executable here
    return dict(
        request_plane_uri="nats://localhost:4222",
        log_level=0,
    )
def test_parse_args_default(default_values):
    """Parser with no arguments must fall back to the documented defaults."""
    args, parser = Parser.parse_args([])
    assert args.request_plane_uri == default_values["request_plane_uri"]
    assert args.log_level == default_values["log_level"]
    # pytest-idiomatic asserts replace the manual `raise Exception` checks;
    # pytest reports either as a test failure, but asserts show the values.
    assert not args.operators, f"Expected no operators by default, got {args.operators}"
    assert (
        not args.operator_configs
    ), f"Expected no operators by default, got {args.operator_configs}"
@pytest.mark.parametrize(
    "valid_request_plane_uri",
    [
        "https://example.com",
        # Add valid request plane uri values
    ],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
    """A user-supplied --request-plane-uri must be parsed through unchanged."""
    parsed_args, _ = Parser.parse_args(
        ["--request-plane-uri", valid_request_plane_uri]
    )
    assert parsed_args.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
    """Return a copy of *args_list* with every None entry removed."""
    cleaned = []
    for item in args_list:
        if item is not None:
            cleaned.append(item)
    return cleaned
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        ("name:abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
    """--operator with a valid model spec must round-trip through the parser."""
    # Create an on-disk repository containing the model directory "abc".
    model_repo_path = tmp_path / "model_repo"
    model_repo_path.mkdir()
    (model_repo_path / "abc").mkdir()
    # Tests with valid arguments
    model_args = clean_argument_list(
        [
            first_arg,
            second_arg,
            third_arg,
            f"repository:{model_repo_path}",
            "module:worker.triton_core_operator:TritonCoreOperator",
        ]
    )
    print(model_args)
    args, _ = Parser.parse_args(["--operator"] + model_args)
    assert args.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
    """--operator without any value must exit and print a usage error."""
    with pytest.raises(SystemExit):
        Parser.parse_args(["--operator"])
    stderr_text = capsys.readouterr().err
    assert "expected at least one argument" in stderr_text
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        # TODO: Revisit can be uncommented once the operator module can be inferred automatically.
        # ("abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
    """--operator with an explicit module spec must round-trip through the parser."""
    repo_path = tmp_path / "worker_repo"
    repo_path.mkdir()
    (repo_path / "abc").mkdir()
    # Tests with valid arguments
    operator_args = clean_argument_list([first_arg, second_arg, third_arg])
    full_args = operator_args + ["module:dummyworkflow:Workflow"]
    args, _ = Parser.parse_args(["--operator"] + full_args)
    assert args.operators[0] == full_args
worker/tests/python/unit/test_logger.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
pytest
from
triton_distributed.worker.log_formatter
import
LOGGER_NAME
,
setup_logger
logger
=
logging
.
getLogger
(
LOGGER_NAME
)
# Shared message logged at every level by logging_function.
MSG = "This is a sample message"

"""
Tests for Logging module
"""


def logging_function(logger):
    """Emit one record per level on *logger*.

    Produces, in order: info, warning, exception (recorded at ERROR level,
    with traceback), error, debug.
    """
    logger.info(MSG)
    logger.warning(MSG)
    try:
        raise Exception("This is an exception")
    except Exception:
        logger.exception(MSG)
    logger.error(MSG)
    logger.debug(MSG)
@pytest.fixture
def reset_logger(caplog):
    """Reset every known logger to a clean state before a log-level test.

    Detaches and closes all handlers, resets levels to NOTSET, re-enables
    propagation, and clears any records caplog captured so far.
    """
    all_loggers = [
        logging.getLogger(name) for name in logging.root.manager.loggerDict
    ]
    all_loggers.append(logging.getLogger())
    for current in all_loggers:
        # Copy the handler list since removeHandler mutates it.
        for handler in current.handlers[:]:
            current.removeHandler(handler)
            handler.close()
        current.setLevel(logging.NOTSET)
        current.propagate = True
    caplog.clear()
@pytest.mark.parametrize(
    "log_level, expected_record_counts",
    [
        # For log-level 0 only error and exception should be recorded
        (0, 2),
        # For log-level 1 only info, error, exception and warning should be recorded
        (1, 4),
        # All logs(error, exception, info, debug and warning) should be printed for log-level 2
        (2, 5),
    ],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
    """Each configured log level must record exactly the expected count."""
    caplog.set_level(log_level)
    setup_logger(log_level=log_level)
    logging_function(logger)
    assert len(caplog.records) == expected_record_counts
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment