Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0bfd9a76
Commit
0bfd9a76
authored
Feb 24, 2025
by
Neelay Shah
Committed by
GitHub
Feb 24, 2025
Browse files
refactor: remove python native runtime
parent
8f741f14
Changes
132
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
0 additions
and
1666 deletions
+0
-1666
runtime/tests/python/integration/operators/triton_core_models/multiply/config.pbtxt
...ration/operators/triton_core_models/multiply/config.pbtxt
+0
-16
runtime/tests/python/integration/operators/triton_core_models/postprocessing/1/model.py
...on/operators/triton_core_models/postprocessing/1/model.py
+0
-66
runtime/tests/python/integration/operators/triton_core_models/postprocessing/config.pbtxt
.../operators/triton_core_models/postprocessing/config.pbtxt
+0
-55
runtime/tests/python/integration/operators/triton_core_models/preprocessing/1/model.py
...ion/operators/triton_core_models/preprocessing/1/model.py
+0
-86
runtime/tests/python/integration/operators/triton_core_models/preprocessing/config.pbtxt
...n/operators/triton_core_models/preprocessing/config.pbtxt
+0
-68
runtime/tests/python/integration/test_add_multiply_divide.py
runtime/tests/python/integration/test_add_multiply_divide.py
+0
-234
runtime/tests/python/integration/test_consolidated_logging.py
...ime/tests/python/integration/test_consolidated_logging.py
+0
-237
runtime/tests/python/integration/test_direct.py
runtime/tests/python/integration/test_direct.py
+0
-178
runtime/tests/python/integration/test_mock_disaggregated_serving.py
...sts/python/integration/test_mock_disaggregated_serving.py
+0
-290
runtime/tests/python/integration/test_perf_benchmark.py
runtime/tests/python/integration/test_perf_benchmark.py
+0
-237
runtime/tests/python/unit/test_args.py
runtime/tests/python/unit/test_args.py
+0
-127
runtime/tests/python/unit/test_logger.py
runtime/tests/python/unit/test_logger.py
+0
-72
No files found.
runtime/tests/python/integration/operators/triton_core_models/multiply/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
runtime/tests/python/integration/operators/triton_core_models/postprocessing/1/model.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """Triton Python-backend post-processing model.

    Decodes token ids produced upstream back into token strings using a
    hard-coded XLNet tokenizer (stands in for a real LLM tokenizer).
    """

    def initialize(self, args):
        # Parse the model config so we can look up output datatypes below.
        model_config = json.loads(args["model_config"])
        # Cache the numpy dtype of each declared output as self.<name>_dtype
        # (here only "OUTPUT" -> self.output_dtype).
        for output_name in ["OUTPUT"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        # pb_utils.Logger is used as a module-level logger object.
        self._logger = pb_utils.Logger

    def execute(self, requests):
        """Convert each request's OUTPUT_IDS tensor to token strings."""
        responses = []
        for idx, request in enumerate(requests):
            # Get input tensors
            output_ids = pb_utils.get_input_tensor_by_name(
                request, "OUTPUT_IDS"
            ).as_numpy()
            # Map ids back to token strings.
            output_result = np.array(
                self.tokenizer.convert_ids_to_tokens((output_ids.tolist()))
            )
            # NOTE(review): exact whitespace of this log f-string reconstructed
            # from a mangled scrape — confirm against the original file.
            self._logger.log_verbose(f"Output Result \n\n{output_result}")
            output_tensor = pb_utils.Tensor(
                "OUTPUT", output_result.astype(self.output_dtype)
            )
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor]
            )
            responses.append(inference_response)
        return responses
runtime/tests/python/integration/operators/triton_core_models/postprocessing/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for postprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for postprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
runtime/tests/python/integration/operators/triton_core_models/preprocessing/1/model.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """
    This is a mock disaggregated serving pre-processing model.
    """

    def initialize(self, args):
        # Parse the model config so we can look up output datatypes below.
        model_config = json.loads(args["model_config"])
        # Cache the numpy dtype of each declared output as self.<name>_dtype
        # (self.input_ids_dtype, self.input_length_dtype, ...).
        for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        # pb_utils.Logger is used as a module-level logger object.
        self.logger = pb_utils.Logger

    def execute(self, requests):
        """Tokenize each request's "query" string into INPUT_IDS plus lengths."""
        self.logger.log_verbose("In preprocessing execute!")
        responses = []
        for idx, request in enumerate(requests):
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "request_output_len"
            ).as_numpy()
            self.logger.log_verbose(f"query(pre-proc) {query}")
            # query is a 1-element array of bytes; decode then encode to ids.
            tokenize = np.array(self.tokenizer.encode(query[0].decode()))
            self.logger.log_verbose(f"tokenize(pre-proc) {tokenize.size}")
            input_length = np.array([tokenize.size])
            # Just forwarding query to the pre-processed input_ids
            input_id_tensor = pb_utils.Tensor(
                "INPUT_IDS", tokenize.astype(self.input_ids_dtype)
            )
            # Just forwarding query to the pre-processed input_ids
            input_length_tensor = pb_utils.Tensor(
                "INPUT_LENGTH", input_length.astype(self.input_length_dtype)
            )
            request_output_len_tensor = pb_utils.Tensor(
                "REQUEST_OUTPUT_LEN", request_output_len
            )
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    input_id_tensor,
                    input_length_tensor,
                    request_output_len_tensor,
                ]
            )
            responses.append(inference_response)
        return responses
runtime/tests/python/integration/operators/triton_core_models/preprocessing/config.pbtxt
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
runtime/tests/python/integration/test_add_multiply_divide.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
sys
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator (add/multiply/divide + the
    add_multiply_divide aggregate) and tear the deployment down after the test.

    log_dir is a fixture — assumed to be a pathlib.Path (it is used with `/`
    and `.mkdir`); confirm against conftest.
    """
    operator_configs = {}
    # Parametrized flag forwarded into each operator config.
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
    # Add configs for triton core operators
    triton_core_operators = ["add", "multiply", "divide"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            parameters={"store_outputs_in_response": store_outputs_in_response},
            repository=MODEL_REPOSITORY,
        )
    # Add configs for other custom operators
    operator_name = "add_multiply_divide"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        parameters={"store_outputs_in_response": store_outputs_in_response},
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Set the logging directory
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests, store_inputs_in_request):
    """
    Post requests to add_multiply_divide operator.

    Connects its own UCP data plane and NATS request plane, sends
    `num_requests` random inputs, and asserts every response matches the
    expected values from _create_inputs. Planes are closed before returning.
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    add_multiply_divide_operator = RemoteOperator(
        "add_multiply_divide", request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_operator.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        # Optionally embed the input tensor in the request message itself
        # instead of referencing it through the data plane.
        if store_inputs_in_request:
            request.store_inputs_in_request.add("int64_input")
        print(request)
        results.append(add_multiply_divide_operator.async_infer(request))
        expected_results[request_id] = outputs[i]
    # Await responses in completion order and check every output tensor.
    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release tensor references promptly so the data plane can
                # reclaim buffers before close().
                del output
            print(expected_results[response.request_id])
            del response
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(
        post_requests(
            num_requests=num_requests,
            store_inputs_in_request=store_inputs_in_request,
        )
    )
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(False, False), (True, True)],
)
def test_add_multiply_divide(
    request,
    nats_server,
    workers,
    store_inputs_in_request,
    store_outputs_in_response,
):
    """End-to-end check of the add_multiply_divide pipeline.

    The client runs in a child process so each test gets its own data
    plane; exit code 0 means all responses matched expectations.
    """
    client_proc = Process(target=run, args=(2, store_inputs_in_request))
    client_proc.start()
    client_proc.join()
    assert client_proc.exitcode == 0
runtime/tests/python/integration/test_consolidated_logging.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
pathlib
import
sys
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator, optionally consolidating worker logs.

    Same topology as the add_multiply_divide test, but the Deployment is
    given the parametrized `consolidate_logs` flag plus the shared log_dir
    so the test can count the log files each mode produces.
    """
    operator_configs = {}
    # Add configs for triton core operators
    triton_core_operators = ["add", "multiply", "divide"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            repository=MODEL_REPOSITORY,
        )
    # Add configs for other custom operators
    operator_name = "add_multiply_divide"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Set the logging directory
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    # Parametrized flag controlling whether worker logs are merged.
    consolidate_logs = request.getfixturevalue("consolidate_logs")
    worker_deployment = Deployment(
        worker_configs,
        consolidate_logs=consolidate_logs,
        log_dir=log_dir,
    )
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests):
    """
    Post requests to add_multiply_divide operator.

    Connects its own UCP data plane and NATS request plane, sends
    `num_requests` random inputs, and asserts every response matches the
    expected values from _create_inputs. Planes are closed before returning.
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    add_multiply_divide_operator = RemoteOperator(
        "add_multiply_divide", request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_operator.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        print(request)
        results.append(add_multiply_divide_operator.async_infer(request))
        expected_results[request_id] = outputs[i]
    # Await responses in completion order and check every output tensor.
    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release tensor references promptly so the data plane can
                # reclaim buffers before close().
                del output
            print(expected_results[response.request_id])
            del response
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(post_requests(num_requests=num_requests))
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(120)
@pytest.mark.parametrize(
    "consolidate_logs",
    [True, False],
)
def test_consolidate_logs(request, nats_server, workers, consolidate_logs, log_dir):
    """Run the pipeline, then verify the per-worker log file layout for both
    consolidated and non-consolidated logging modes."""
    # Using a separate process to use data plane across multiple tests.
    p = Process(target=run, args=(2,))
    p.start()
    p.join()
    assert p.exitcode == 0
    # Test the number of logs that were created
    log_dir_path = pathlib.Path(log_dir) / request.node.name
    worker_log_dir_count = 0
    for name in log_dir_path.iterdir():
        worker_log_dir_count += 1
        expected_worker_log_count = 1
        # Non-consolidated triton-core workers write an extra log file;
        # the custom add_multiply_divide worker always writes one.
        if not consolidate_logs and name.stem not in ["add_multiply_divide"]:
            expected_worker_log_count = 2
        worker_log_path = log_dir_path / name.stem
        worker_log_count = 0
        for log_name in worker_log_path.iterdir():
            worker_log_count += 1
        assert worker_log_count == expected_worker_log_count
    # One log directory per worker: add, multiply, divide, add_multiply_divide.
    assert worker_log_dir_count == 4
runtime/tests/python/integration/test_direct.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
sys
import
uuid
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(request, log_dir, number_workers=10):
    """Deploy `number_workers` identical workers all serving the "identity"
    operator, so the test can exercise direct (targeted) request routing."""
    # Add configs for identity operator
    operator_name = "identity"
    operator_config = OperatorConfig(
        name=operator_name,
        implementation="identity:Identity",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )
    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    for i in range(number_workers):
        # Set the logging directory
        worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
        worker_configs.append(
            WorkerConfig(
                name=operator_name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )
    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    yield worker_deployment
    # Teardown: stop all worker processes.
    worker_deployment.shutdown()
async def post_requests(num_requests, num_targets):
    """
    Posts requests until the number of
    workers that respond is equal to the number of targets
    after that - only sends requests to one of the targets
    """
    # Reset UCX state — this function runs in a fresh child process.
    ucp.reset()
    timeout = 5
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()
    identity_operator = RemoteOperator("identity", request_plane, data_plane)
    target_components = set()
    target_component_list: list[uuid.UUID] = []
    responding_components = set()
    for index in range(num_requests):
        request = identity_operator.create_request(
            inputs={"input": [index]},
        )
        target_component = None
        if target_component_list:
            # we have the list of targets
            # only send to workers in that list
            target_index = index % len(target_component_list)
            target_component = target_component_list[target_index]
            identity_operator.component_id = target_component
        async for response in await identity_operator.async_infer(request):
            responding_component = response.component_id
            # Identity operator must echo the input back unchanged.
            numpy.testing.assert_equal(
                numpy.from_dlpack(response.outputs["output"]),
                request.inputs["input"],
            )
            responding_components.add(responding_component)
            if not target_component_list:
                # add to list of acceptable targets
                target_components.add(responding_component)
                # NOTE(review): nesting of this finalization under the
                # "not target_component_list" branch reconstructed from a
                # mangled scrape — confirm against the original file.
                if len(target_components) >= num_targets:
                    # finalize list
                    target_component_list = list(target_components)
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
    # Once targeting starts, only workers in the target list should respond.
    assert target_components == responding_components
def run(num_requests, num_targets=5):
    """Drive post_requests in a fresh event loop and exit with its status.

    Intended as a multiprocessing.Process target; the exit code signals
    success (0 / None) or failure to the parent test.
    """
    status = asyncio.run(
        post_requests(
            num_requests=num_requests,
            num_targets=num_targets,
        )
    )
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(30)
def test_direct(request, nats_server, workers):
    """Direct-routing smoke test: the client child process exits 0 when
    targeted requests are answered only by the selected workers."""
    client_proc = Process(target=run, args=(50,))
    client_proc.start()
    client_proc.join()
    assert client_proc.exitcode == 0
runtime/tests/python/integration/test_mock_disaggregated_serving.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
queue
import
sys
import
time
from
functools
import
partial
from
multiprocessing
import
Process
import
cupy
import
numpy
import
pytest
import
tritonclient.grpc
as
grpcclient
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
transformers
import
XLNetTokenizer
from
tritonclient.utils
import
InferenceServerException
from
tritonserver
import
Tensor
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port used to reach the local NATS.io request plane (nats://localhost:<port>).
NATS_PORT = 4223
# Paths inside the test container holding model/operator repositories.
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Triton verbose logging level for worker processes.
TRITON_LOG_LEVEL = 6

logger = get_logger(__name__)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
try:
    if cupy.cuda.is_available():
        pass
    else:
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# Slower test than others - make it nightly for now
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(request, log_dir):
    """Deploy one worker per operator for this test and tear it down afterwards.

    Yields the started Deployment; shutdown is guaranteed via try/finally even
    if the test body (or a later fixture) raises.
    """
    operator_configs = {}

    # Add configs for triton core operators
    triton_core_operators = ["preprocessing", "context", "generation", "postprocessing"]
    for operator_name in triton_core_operators:
        operator_configs[operator_name] = OperatorConfig(
            name=operator_name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            repository=MODEL_REPOSITORY,
        )

    # Add configs for other custom operators
    operator_name = "mock_disaggregated_serving"
    operator_configs[operator_name] = OperatorConfig(
        name=operator_name,
        implementation="mock_disaggregated_serving:MockDisaggregatedServing",
        version=1,
        max_inflight_requests=10,
        repository=OPERATORS_REPOSITORY,
    )

    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)

    # We will instantiate a worker for each operator
    for name, operator_config in operator_configs.items():
        # Each worker logs into its own sub-directory of the test's log dir.
        worker_log_dir = test_log_dir / name
        worker_configs.append(
            WorkerConfig(
                name=name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )

    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    try:
        yield worker_deployment
    finally:
        # Previously shutdown() was only reached on a clean resume; a failure
        # while the fixture was live would leak the worker processes.
        worker_deployment.shutdown()
def _create_inputs(number):
    """Build `number` identical request/expected-output pairs.

    Each input is {"query": 1-element object array, "request_output_len": int32
    array}; each expected output wraps the prompt's token strings in a Tensor.

    Returns:
        (inputs, outputs): parallel lists of length `number`.
    """
    inputs = []
    outputs = []
    # Loading the tokenizer is expensive (reads/downloads the vocab); it is
    # loop-invariant, so do it once instead of once per request.
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    for _ in range(number):
        request_output_len = 10
        query_arr = numpy.array(["This is a sample prompt"], dtype=numpy.object_)
        request_output_len_arr = numpy.array([request_output_len], dtype=numpy.int32)
        input_ = {"query": query_arr, "request_output_len": request_output_len_arr}
        # NOTE: the previous numpy.repeat(query_arr, ...) "expected_output" was
        # dead code — it was immediately overwritten by the token array below.
        tokens = numpy.array(tokenizer.encode(query_arr[0]))
        expected_output = numpy.array(tokenizer.convert_ids_to_tokens(tokens.tolist()))
        # presumably mirrors what the postprocessing model emits — confirm
        # against the models in MODEL_REPOSITORY.
        output_data_ = {"output": Tensor._from_object(expected_output)}
        inputs.append(input_)
        outputs.append(output_data_)
    return inputs, outputs
async def post_requests(num_requests):
    """Send `num_requests` requests to the mock disaggregated-serving operator
    and validate every streamed response against the expected token output.

    Opens its own UCX data plane and NATS request plane; both are closed in a
    finally block whether or not the requests succeed.

    Raises:
        Any exception hit while collecting responses (after printing it).
    """
    ucp.reset()
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()

    mock_disaggregated_serving_operator = RemoteOperator(
        "mock_disaggregated_serving", request_plane, data_plane
    )
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests)
    token_latency = []
    timeout = True  # forwarded as UcpDataPlane.close(wait_for_release=...)
    try:
        for i, input_dict in enumerate(inputs):
            request_id = str(i)
            request = mock_disaggregated_serving_operator.create_request(
                inputs=input_dict, request_id=request_id
            )
            begin = time.time()
            response_count = 0
            async for response in await mock_disaggregated_serving_operator.async_infer(
                inference_request=request
            ):
                token_latency.append(time.time() - begin)
                expected_results[request_id] = outputs[i]
                # Intermediate (non-final) responses carry the output tensor.
                if not response.final:
                    for output_name, expected_value in expected_results[
                        response.request_id
                    ].items():
                        output_value = response.outputs[output_name].to_bytes_array()
                        print(f"Final Output: {output_value}")
                        numpy.testing.assert_equal(
                            output_value, expected_value.to_bytes_array()
                        )
                response_count += 1
            # 1 response from context and 10 responses from generation
            assert response_count == 11
    except Exception as e:
        # The old handler did `del response` here, which raised NameError and
        # masked the real error whenever async_infer failed before the first
        # response was bound; it also duplicated the plane cleanup and used
        # `raise e`, losing the original traceback position.
        print("Failed collecting responses:" + repr(e))
        raise
    finally:
        print(f"Token latency: {token_latency}")
        data_plane.close(wait_for_release=timeout)
        await request_plane.close()
def run(num_requests):
    """Drive post_requests in a fresh event loop; exit with its result.

    post_requests returns None on success, so the process exit code is 0.
    """
    result = asyncio.run(post_requests(num_requests=num_requests))
    sys.exit(result)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present or test is not configured to run with mock disaggregated serving",
)
def test_mock_disaggregated_serving(request, nats_server, workers):
    """Post one request end-to-end from a child process; expect a clean exit."""
    # A child process lets each test own its data-plane lifecycle instead of
    # sharing one with the pytest main process.
    client_process = Process(target=run, args=(1,))
    client_process.start()
    client_process.join()
    assert client_process.exitcode == 0
class UserData:
    """Shared state between the gRPC stream callback and the test body."""

    def __init__(self):
        # Filled by `callback` with successful results, or with
        # InferenceServerException objects on failure; drained in arrival
        # order by send_kserve_requests.
        self._completed_requests: queue.Queue[
            grpcclient.Result | InferenceServerException
        ] = queue.Queue()
# Stream callback for InferenceServerClient. The final two parameters must be
# (result, error): on success `error` is None and `result` carries the
# inference result; on failure `error` is a tritonclient
# InferenceServerException with the details. Whichever is populated gets
# queued for the consumer.
def callback(user_data, result, error):
    user_data._completed_requests.put(error if error else result)
async def send_kserve_requests(num_requests):
    """Send streaming requests through the KServe gRPC frontend at
    localhost:8001 and print each decoded "output" tensor.

    Raises the first InferenceServerException received on the stream.
    """
    # NOTE(review): outputs_dicts (the expected outputs) is never compared
    # against the received results — responses are only printed below.
    inputs_dict, outputs_dicts = _create_inputs(num_requests)
    inputs = []
    inputs.append(grpcclient.InferInput("query", [1], "BYTES"))
    inputs.append(grpcclient.InferInput("request_output_len", [1], "INT32"))
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        # Responses arrive asynchronously via `callback` into user_data.
        client.start_stream(
            callback=partial(callback, user_data),
        )
        for i, input_dict in enumerate(inputs_dict):
            inputs[0].set_data_from_numpy(input_dict["query"])
            inputs[1].set_data_from_numpy(input_dict["request_output_len"])
            client.async_stream_infer(
                model_name="mock_disaggregated_serving", inputs=inputs
            )
        recv_count = 0
        # NOTE(review): waits for exactly 10 responses regardless of
        # num_requests — presumably the generation responses of a single
        # request; confirm before calling with num_requests > 1.
        while recv_count < 10:
            data_item = user_data._completed_requests.get()
            recv_count += 1
            if isinstance(data_item, InferenceServerException):
                raise data_item
            else:
                result = data_item.as_numpy("output")
                print("test\n")
                print(result)
    # Wait for the tensor clean-up
    time.sleep(5)
def run_kserve(num_requests):
    """Run send_kserve_requests in a new event loop; exit with its result.

    send_kserve_requests returns None on success, giving exit code 0.
    """
    status = asyncio.run(send_kserve_requests(num_requests=num_requests))
    sys.exit(status)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
def test_mock_disaggregated_serving_kserve(request, nats_server, workers, api_server):
    """Same flow as test_mock_disaggregated_serving, via the KServe frontend."""
    # A child process lets each test own its data-plane lifecycle instead of
    # sharing one with the pytest main process.
    client_process = Process(target=run_kserve, args=(1,))
    client_process.start()
    client_process.join()
    assert client_process.exitcode == 0
runtime/tests/python/integration/test_perf_benchmark.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
numpy
import
pytest
import
ucp
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.runtime.deployment
import
Deployment
from
triton_distributed.runtime.logger
import
get_logger
from
triton_distributed.runtime.operator
import
OperatorConfig
from
triton_distributed.runtime.remote_operator
import
RemoteOperator
from
triton_distributed.runtime.worker
import
WorkerConfig
# Port the test-local NATS server (nats_server fixture) listens on.
NATS_PORT = 4223
# Triton-core model repository (unused directly here; kept for parity with the
# integration tests).
MODEL_REPOSITORY = (
    "/workspace/runtime/tests/python/integration/operators/triton_core_models"
)
# Repository containing the identity operator used by this benchmark.
OPERATORS_REPOSITORY = "/workspace/runtime/tests/python/integration/operators"
# Minimal worker logging so log I/O does not perturb the benchmark numbers.
TRITON_LOG_LEVEL = 0
logger = get_logger(__name__)

# Slower test than others - make it nightly for now.
# pytestmark applies the marker to every test in this module.
pytestmark = pytest.mark.nightly
@pytest.fixture
def workers(log_dir, request, number_workers=1):
    """Deploy `number_workers` identity-operator workers for one benchmark run.

    NOTE(review): number_workers is a plain default parameter on a fixture —
    pytest ignores defaulted arguments when resolving fixtures, so it is
    always 1 here; confirm that is intended.
    """
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")

    # Add configs for identity operator
    operator_name = "identity"
    operator_config = OperatorConfig(
        name=operator_name,
        implementation="identity:Identity",
        version=1,
        max_inflight_requests=10,
        parameters={"store_outputs_in_response": store_outputs_in_response},
        repository=OPERATORS_REPOSITORY,
    )

    worker_configs = []
    test_log_dir = log_dir / request.node.name
    test_log_dir.mkdir(parents=True, exist_ok=True)
    for i in range(number_workers):
        # Each worker logs into its own sub-directory of the test's log dir.
        worker_log_dir = test_log_dir / (operator_name + "_" + str(i))
        worker_configs.append(
            WorkerConfig(
                name=operator_name,
                request_plane=NatsRequestPlane,
                data_plane=UcpDataPlane,
                request_plane_args=(
                    [],
                    {"request_plane_uri": f"nats://localhost:{NATS_PORT}"},
                ),
                log_level=TRITON_LOG_LEVEL,
                log_dir=str(worker_log_dir),
                operators=[operator_config],
            )
        )

    worker_deployment = Deployment(worker_configs)
    worker_deployment.start()
    try:
        yield worker_deployment
    finally:
        # Guarantee worker teardown even if the benchmark body raises;
        # previously a failure while the fixture was live leaked the workers.
        worker_deployment.shutdown()
def
_create_inputs
(
number
,
tensor_size_in_kb
):
inputs
=
[]
outputs
=
[]
elem_cnt
=
int
(
tensor_size_in_kb
*
1024
/
4
)
for
_
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
elem_cnt
])
expected_
=
{}
expected_
[
"output"
]
=
input_
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
def run(
    aio_benchmark,
    store_inputs_in_request,
    store_outputs_in_response,
    tensor_size_in_kb,
    data_plane_tracker,
):
    """Benchmark post_requests against the identity operator.

    The shared UcpDataPlane is opened on the first parametrized run and closed
    on the last (tracked by data_plane_tracker); the NATS request plane is
    opened and closed on every run.
    """
    if data_plane_tracker.is_first_run:
        ucp.reset()
        data_plane_tracker._data_plane = UcpDataPlane()
        data_plane_tracker._data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated since Python 3.10; consider an explicit loop if revived.
    asyncio.get_event_loop().run_until_complete(request_plane.connect())
    identity_operator = RemoteOperator(
        "identity", request_plane, data_plane_tracker._data_plane
    )
    inputs, outputs = _create_inputs(1, tensor_size_in_kb)
    # aio_benchmark presumably awaits post_requests repeatedly to gather
    # latency statistics — see the aio_benchmark fixture for the contract.
    aio_benchmark(
        post_requests,
        identity_operator,
        inputs,
        outputs,
        store_inputs_in_request,
        store_outputs_in_response,
    )
    timeout = 5
    asyncio.get_event_loop().run_until_complete(request_plane.close())
    if data_plane_tracker.is_last_run:
        # NOTE(review): positional arg — the sibling integration test calls
        # close(wait_for_release=...); confirm 5 matches the first positional
        # parameter's meaning here.
        data_plane_tracker._data_plane.close(timeout)
async def post_requests(
    identity_model, inputs, outputs, store_inputs_in_request, store_outputs_in_response
):
    """Issue one request per input tensor and pull every echoed output to host.

    `store_outputs_in_response` is accepted for signature parity with the
    benchmark parametrization; only the worker-side operator reads it.
    """
    pending = []
    expected_results = {}
    for index, tensor in enumerate(inputs):
        request_id = str(index)
        request = identity_model.create_request(
            inputs={"input": tensor}, request_id=request_id
        )
        if store_inputs_in_request:
            # presumably marks "input" for embedding in the request itself
            # rather than the data plane — name-based assumption, confirm.
            request.store_inputs_in_request.add("input")
        pending.append(identity_model.async_infer(request))
        expected_results[request_id] = outputs[index]
    for finished in asyncio.as_completed(pending):
        responses = await finished
        async for response in responses:
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                # Materialize on host so the remote tensor can be released.
                _ = numpy.from_dlpack(output.to_host())
                del output
            del response
@pytest.fixture(scope="module")
def data_plane_tracker():
    """Module-scoped counter so a single UcpDataPlane spans all benchmark runs.

    test_identity sets total_runs once and calls increment_run per run; `run`
    opens the data plane on is_first_run and closes it on is_last_run.
    """

    class Tracker:
        def __init__(self):
            # Total parametrized runs expected (set by the first test run).
            self.total_runs = 0
            # Runs executed so far (bumped via increment_run()).
            self.current_run = 0
            # The shared UcpDataPlane instance, assigned by `run`.
            self._data_plane = None

        def increment_run(self):
            self.current_run += 1

        @property
        def is_first_run(self):
            return self.current_run == 1

        @property
        def is_last_run(self):
            return self.current_run == self.total_runs

    return Tracker()
# FIXME: NATS default size limit is 1 MB. However, even when the tensor_size_in_kb
# is set as 600, which corresponds to 0.6144 MB, we are hitting MaxPayloadError.
# Need to investigate why the limit is being hit.
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present or test is configured to run with mock disaggregated_serving",
)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(True, True), (False, False)],
)
@pytest.mark.parametrize(
    "tensor_size_in_kb",
    [10, 100, 500],
)
@pytest.mark.benchmark(min_rounds=100, max_time=1)
def test_identity(
    request,
    nats_server,
    workers,
    aio_benchmark,
    store_inputs_in_request,
    store_outputs_in_response,
    tensor_size_in_kb,
    data_plane_tracker,
):
    """
    This benchmark test checks the latency of a simple operator which returns input in its output
    without any processing.
    NOTE: We can not use benchmark fixture in the child process. Hence, we are required to use the
    same process for opening then data plane object as pytest.
    This means that the pytest main process cannot create another data plane object in any other
    tests. Hence, we will use a run tracker to open and close the data plane
    """
    if data_plane_tracker.total_runs == 0:
        # 6 == 2 (store-flag pairs) x 3 (tensor sizes) parametrize combos;
        # must stay in sync with the parametrize decorators above.
        data_plane_tracker.total_runs = 6  # Set this to the number of parameters
    data_plane_tracker.increment_run()
    run(
        aio_benchmark,
        store_inputs_in_request,
        store_outputs_in_response,
        tensor_size_in_kb,
        data_plane_tracker,
    )
runtime/tests/python/unit/test_args.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
from
triton_distributed.runtime.parser
import
Parser
"""
Tests for parsing the arguments by command line parser
"""
@pytest.fixture
def default_values():
    """Expected CLI defaults that the parser tests compare against."""
    # TODO: Add the default options for the worker executable here
    return {
        "request_plane_uri": "nats://localhost:4222",
        "log_level": 0,
    }
def test_parse_args_default(default_values):
    """Parser defaults must match the documented CLI defaults."""
    args, _ = Parser.parse_args([])
    assert args.request_plane_uri == default_values["request_plane_uri"]
    assert args.log_level == default_values["log_level"]
    # Plain asserts replace the previous `raise Exception(...)` blocks: pytest
    # reports assertion failures with introspection, and a failed expectation
    # should be a test failure, not a generic Exception.
    assert not args.operators, f"Expected no operators by default, got {args.operators}"
    assert (
        not args.operator_configs
    ), f"Expected no operators by default, got {args.operator_configs}"
@pytest.mark.parametrize(
    "valid_request_plane_uri",
    [
        "https://example.com",
        # Add valid request plane uri values
    ],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
    """A user-supplied --request-plane-uri must round-trip through the parser."""
    parsed, _ = Parser.parse_args(["--request-plane-uri", valid_request_plane_uri])
    assert parsed.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
    """Return args_list with every None entry removed, order preserved.

    Falsy non-None values (0, "", False) are kept.
    """
    cleaned = []
    for item in args_list:
        if item is not None:
            cleaned.append(item)
    return cleaned
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        ("name:abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
    """--operator with repository and module specs parses into args.operators."""
    # Create a model repository containing the "abc" model the args refer to.
    model_repo_path = tmp_path / "model_repo"
    model_repo_path.mkdir()
    (model_repo_path / "abc").mkdir()

    model_args = clean_argument_list(
        [
            first_arg,
            second_arg,
            third_arg,
            f"repository:{model_repo_path}",
            "module:worker.triton_core_operator:TritonCoreOperator",
        ]
    )
    print(model_args)
    cli_args = ["--operator"] + model_args
    parsed, _ = Parser.parse_args(cli_args)
    assert parsed.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
    """--operator with no values must exit with a usage error on stderr."""
    with pytest.raises(SystemExit):
        Parser.parse_args(["--operator"])
    stderr = capsys.readouterr().err
    assert "expected at least one argument" in stderr
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        # TODO: Revisit can be uncommented once the operator module can be inferred automatically.
        # ("abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
    """--operator plus a module spec parses back as the original value list."""
    # Create a worker repository containing the "abc" operator referenced above.
    repo_path = tmp_path / "worker_repo"
    repo_path.mkdir()
    (repo_path / "abc").mkdir()

    operator_args = clean_argument_list([first_arg, second_arg, third_arg])
    cli_args = ["--operator"] + operator_args + ["module:dummyworkflow:Workflow"]
    parsed, _ = Parser.parse_args(cli_args)
    assert parsed.operators[0] == operator_args + ["module:dummyworkflow:Workflow"]
runtime/tests/python/unit/test_logger.py
deleted
100644 → 0
View file @
8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
pytest
from
triton_distributed.runtime.logger
import
get_logger
logger = logging.getLogger(__name__)

MSG = "This is a sample message"

"""
Tests for Logging module
"""


def logging_function(logger):
    """Emit one record at each severity exercised by the logging tests.

    Order of records: info, warning, exception (logged at ERROR), error, debug.
    """
    for emit in (logger.info, logger.warning):
        emit(MSG)
    try:
        raise Exception("This is an exception")
    except Exception:
        # logger.exception must run inside an except clause so the traceback
        # is attached to the record.
        logger.exception(MSG)
    for emit in (logger.error, logger.debug):
        emit(MSG)
@pytest.fixture
def reset_logger(caplog):
    """Strip handlers and reset level/propagation on every known logger,
    then clear caplog, so each test starts from pristine logging state."""
    all_loggers = [
        logging.getLogger(name) for name in logging.root.manager.loggerDict
    ]
    all_loggers.append(logging.getLogger())  # include the root logger
    for lg in all_loggers:
        # Iterate over a snapshot: removeHandler mutates lg.handlers.
        for handler in list(lg.handlers):
            lg.removeHandler(handler)
            handler.close()
        lg.setLevel(logging.NOTSET)
        lg.propagate = True
    caplog.clear()
@pytest.mark.parametrize(
    "log_level, expected_record_counts",
    [
        # For log-level 0 only error and exception should be recorded
        (0, 2),
        # For log-level 1 only info, error, exception and warning should be recorded
        (1, 4),
        # All logs(error, exception, info, debug and warning) should be printed for log-level 2
        (2, 5),
    ],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
    """Record counts from logging_function must match the configured level."""
    caplog.set_level(log_level)
    module_logger = get_logger(logger_name="test_logging", log_level=log_level)
    logging_function(module_logger)
    assert len(caplog.records) == expected_record_counts
Prev
1
…
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment