Commit 0a21fff9 authored by xiabo

Adapt to 0.1.0

parent 9484fd1c
---
BasedOnStyle: Google
IndentWidth: 2
ContinuationIndentWidth: 4
UseTab: Never
MaxEmptyLinesToKeep: 2
SortIncludes: true
CompactNamespaces: true
ReflowComments: true
DerivePointerAlignment: false
PointerAlignment: Left
AllowShortIfStatementsOnASingleLine: false
AllowShortBlocksOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlignAfterOpenBracket: AlwaysBreak
BreakBeforeBraces: Custom
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: true
  AfterNamespace: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: true
BinPackArguments: true
BinPackParameters: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
IndentCaseLabels: true
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.16)
project(tritonbackend LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend utilities" ON)
option(TRITON_ENABLE_MALI_GPU "Enable Arm MALI GPU support in backend utilities" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend utilities" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core)
#
# CUDA
#
if(${TRITON_ENABLE_GPU})
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
message(STATUS "Using CUDA ${CUDA_VERSION}")
set(CUDA_NVCC_FLAGS -std=c++11)
if(CUDA_VERSION VERSION_GREATER "10.1" OR CUDA_VERSION VERSION_EQUAL "10.1")
add_definitions(-DTRITON_ENABLE_CUDA_GRAPH=1)
else()
message(WARNING "CUDA ${CUDA_VERSION} does not support CUDA graphs.")
endif()
endif() # TRITON_ENABLE_GPU
#
# Backend library containing useful source and utilities
#
set(SRC_FILES
"src/backend_common.cc"
"src/backend_input_collector.cc"
"src/backend_memory.cc"
"src/backend_model_instance.cc"
"src/backend_model.cc"
"src/backend_output_responder.cc"
)
if(${TRITON_ENABLE_GPU})
set(SRC_FILES ${SRC_FILES} "src/kernel.h")
endif() # TRITON_ENABLE_GPU
add_library(
triton-backend-utils
${SRC_FILES}
)
if(${TRITON_ENABLE_GPU})
set(HOST_COMPILER_FLAGS "")
if (WIN32)
set(HOST_COMPILER_FLAGS "/MD")
else()
set(HOST_COMPILER_FLAGS "-fPIC")
endif()
set(CUDA_LIBRARIES PUBLIC ${CUDA_LIBRARIES})
cuda_add_library(
kernel-library-new
src/kernel.cu src/kernel.h
OPTIONS -arch compute_53
OPTIONS -code compute_53,sm_53,sm_60,sm_61,sm_62,sm_70,sm_72,sm_75
OPTIONS -Xcompiler ${HOST_COMPILER_FLAGS}
)
endif() # TRITON_ENABLE_GPU
add_library(
TritonBackend::triton-backend-utils ALIAS triton-backend-utils
)
target_include_directories(
triton-backend-utils
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
message("Using MSVC as compiler, default target on Windows 10. "
"If the target system is not Windows 10, please update _WIN32_WINNT "
"to corresponding value.")
endif()
target_compile_features(triton-backend-utils PRIVATE cxx_std_11)
target_compile_options(
triton-backend-utils
PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Werror>
$<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc>
)
# TRITON_ENABLE_GPU exposed in header so set PUBLIC
if(${TRITON_ENABLE_GPU})
target_compile_definitions(
triton-backend-utils
PUBLIC TRITON_ENABLE_GPU=1
)
endif() # TRITON_ENABLE_GPU
# TRITON_ENABLE_MALI_GPU exposed in header so set PUBLIC
if(${TRITON_ENABLE_MALI_GPU})
target_compile_definitions(
triton-backend-utils
PUBLIC TRITON_ENABLE_MALI_GPU=1
)
endif() # TRITON_ENABLE_MALI_GPU
# TRITON_ENABLE_STATS exposed in header so set PUBLIC
if(${TRITON_ENABLE_STATS})
target_compile_definitions(
triton-backend-utils
PUBLIC TRITON_ENABLE_STATS=1
)
endif() # TRITON_ENABLE_STATS
set_target_properties(
triton-backend-utils PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS TRUE
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME tritonbackendutils
)
target_link_libraries(
triton-backend-utils
PUBLIC
triton-core-backendapi # from repo-core
triton-core-serverapi # from repo-core
triton-common-async-work-queue # from repo-common
triton-common-json # from repo-common
)
if(${TRITON_ENABLE_GPU})
target_link_libraries(
triton-backend-utils
PUBLIC
#CUDA::cudart
cudart
PRIVATE
kernel-library-new
)
endif() # TRITON_ENABLE_GPU
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonBackend)
install(
TARGETS
triton-backend-utils
EXPORT
triton-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(${TRITON_ENABLE_GPU})
install(
TARGETS
kernel-library-new
EXPORT
triton-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif() # TRITON_ENABLE_GPU
install(
DIRECTORY include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(
EXPORT
triton-backend-targets
FILE
TritonBackendTargets.cmake
NAMESPACE
TritonBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendTargets.cmake
NAMESPACE TritonBackend::
)
export(PACKAGE TritonBackend)
Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
<!--
# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
# Triton Inference Server Backend
A Triton *backend* is the implementation that executes a model. A
backend can be a wrapper around a deep-learning framework, like
PyTorch, TensorFlow, TensorRT or ONNX Runtime. Or a backend can be
custom C/C++ logic performing any operation (for example, image
pre-processing).
This repo contains documentation on Triton backends and also source,
scripts and utilities for creating Triton backends. You do not need to
use anything provided in this repo to create a Triton backend but you
will likely find its contents useful.
## Frequently Asked Questions
Full documentation is included below but these shortcuts can help you
get started in the right direction.
### Where can I ask general questions about Triton and Triton backends?
Be sure to read all the information below as well as the [general
Triton
documentation](https://github.com/triton-inference-server/server#triton-inference-server)
available in the main
[server](https://github.com/triton-inference-server/server) repo. If
you don't find your answer there you can ask questions on the main
Triton [issues
page](https://github.com/triton-inference-server/server/issues).
### Where can I find all the backends that are available for Triton?
Anyone can develop a Triton backend, so it isn't possible for us to
know about all available backends. But the Triton project does provide
a set of supported backends that are tested and updated with each
Triton release.
**TensorRT**: The TensorRT backend is used to execute TensorRT
models. The
[tensorrt_backend](https://github.com/triton-inference-server/tensorrt_backend)
repo contains the documentation and source for the backend.
**ONNX Runtime**: The ONNX Runtime backend is used to execute ONNX
models. The
[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend)
repo contains the documentation and source for the backend.
**TensorFlow**: The TensorFlow backend is used to execute TensorFlow
models in both GraphDef and SavedModel formats. The same backend is
used to execute both TensorFlow 1 and TensorFlow 2 models. The
[tensorflow_backend](https://github.com/triton-inference-server/tensorflow_backend)
repo contains the documentation and source for the backend.
**PyTorch**: The PyTorch backend is used to execute TorchScript
models. The
[pytorch_backend](https://github.com/triton-inference-server/pytorch_backend)
repo contains the documentation and source for the backend.
**OpenVINO**: The OpenVINO backend is used to execute
[OpenVINO](https://docs.openvinotoolkit.org/latest/index.html)
models. The
[openvino_backend](https://github.com/triton-inference-server/openvino_backend)
repo contains the documentation and source for the backend.
**Python**: The Python backend allows you to write your model logic in
Python. For example, you can use this backend to execute pre/post
processing code written in Python, or to execute a PyTorch Python
script directly (instead of first converting it to TorchScript and
then using the PyTorch backend). The
[python_backend](https://github.com/triton-inference-server/python_backend)
repo contains the documentation and source for the backend.
**DALI**: [DALI](https://github.com/NVIDIA/DALI) is a collection of
highly optimized building blocks and an execution engine that
accelerates the pre-processing of the input data for deep learning
applications. The DALI backend allows you to execute your DALI
pipeline within Triton. The
[dali_backend](https://github.com/triton-inference-server/dali_backend)
repo contains the documentation and source for the backend.
**FIL**: The FIL ([Forest Inference
Library](https://github.com/rapidsai/cuml/tree/branch-21.10/python/cuml/fil))
backend is used to execute a variety of tree-based ML models, including
XGBoost models, LightGBM models, Scikit-Learn random forest models, and cuML
random forest models. The
[fil_backend](https://github.com/triton-inference-server/fil_backend) repo
contains the documentation and source for the backend.
**Important Note!** Not all the above backends are supported on every platform
supported by Triton. Look at the
[Backend-Platform Support Matrix](docs/backend_platform_support_matrix.md)
to see which backends are supported on each platform.
### How can I develop my own Triton backend?
First you probably want to ask on the main Triton [issues
page](https://github.com/triton-inference-server/server/issues) to
make sure you are not duplicating a backend that already exists. Then
follow the [tutorial](examples/README.md) to learn how to create your
first simple Triton backend and incrementally improve it to add more
features. You should also read the complete documentation on [Triton
backends](#backends).
### Can I add (or remove) a backend to an existing Triton installation?
Yes. See [Backend Shared Library](#backend-shared-library) for general
information about how the shared library implementing a backend is
managed by Triton, and [Triton with Unsupported and Custom
Backends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends)
for documentation on how to add your backend to the released Triton
Docker image. For a standard install the globally available backends
are in /opt/tritonserver/backends.
### What about backends developed using the "legacy custom backend" API?
The legacy custom API is removed from Triton. If you have custom
backends that you developed using this older API you must port them to
the new [Triton Backend API](#triton-backend-api).
## Backends
A Triton *backend* is the implementation that executes a model. A
backend can be a wrapper around a deep-learning framework, like
PyTorch, TensorFlow, TensorRT, ONNX Runtime or OpenVINO. A backend can
also implement any functionality you want as long as it adheres to the
[backend API](#triton-backend-api). Triton uses this API to send
requests to the backend for execution and the backend uses the API to
communicate with Triton.
Every model must be associated with a backend. A model's backend is
specified in the model's configuration using the 'backend' setting.
To use the TensorRT backend, the value of this setting should be *tensorrt*.
Similarly, to use the PyTorch, ONNX Runtime or TensorFlow backends, the
`backend` field should be set to *pytorch*, *onnxruntime* or *tensorflow*,
respectively. For all other backends, 'backend' must be set to the name of
the backend.
### Backend Shared Library
Each backend must be implemented as a shared library and the name of
the shared library must be *libtriton_\<backend-name\>.so*. For
example, if the name of the backend is "mybackend", a model indicates
that it uses the backend by setting the model configuration 'backend'
setting to "mybackend", and Triton looks for *libtriton_mybackend.so*
as the shared library that implements the backend. The
[tutorial](examples/README.md) shows examples of how to build your
backend logic into the appropriate shared library.
For a model *M* that specifies backend *B*, Triton searches for the
backend shared library in the following places, in this order:
* \<model_repository\>/M/\<version_directory\>/libtriton_B.so
* \<model_repository\>/M/libtriton_B.so
* \<global_backend_directory\>/B/libtriton_B.so
Where \<global_backend_directory\> is by default
/opt/tritonserver/backends. The --backend-directory flag can be used
to override the default.
Typically you will install your backend into the global backend
directory. For example, if using Triton Docker images you can follow
the instructions in [Triton with Unsupported and Custom
Backends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends). Continuing
the example of a backend named "mybackend", you would install into the
Triton image as:
```
/opt/
  tritonserver/
    backends/
      mybackend/
        libtriton_mybackend.so
        ... # other files needed by mybackend
```
### Triton Backend API
A Triton backend must implement the C interface defined in
[tritonbackend.h](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonbackend.h). The
following abstractions are used by the API.
#### TRITONBACKEND_Backend
A TRITONBACKEND_Backend object represents the backend itself. The
same backend object is shared across all models that use the
backend. The associated API, like TRITONBACKEND_BackendName, is used
to get information about the backend and to associate a user-defined
state with the backend.
A backend can optionally implement TRITONBACKEND_Initialize and
TRITONBACKEND_Finalize to get notification of when the backend object
is created and destroyed (for more information see [backend
lifecycles](#backend-lifecycles)).
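As a rough sketch of how these hooks can fit together (this is not the
tutorial's actual implementation; it assumes a hypothetical user-defined
BackendState type, the RETURN_IF_ERROR macro from this repo's backend
utilities, and the usual includes of tritonbackend.h and
backend_common.h):
```
// Hypothetical user-defined state attached to the backend object.
struct BackendState {
  std::string version_info;
};

extern "C" TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
  const char* name;
  RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &name));

  // Verify that the backend API version used at compile time is
  // compatible with the Triton that is loading this backend.
  uint32_t api_major, api_minor;
  RETURN_IF_ERROR(TRITONBACKEND_ApiVersion(&api_major, &api_minor));
  if (api_major != TRITONBACKEND_API_VERSION_MAJOR) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_UNSUPPORTED, "triton backend API version mismatch");
  }

  // Associate user-defined state with the backend object; it is freed
  // in TRITONBACKEND_Finalize.
  RETURN_IF_ERROR(TRITONBACKEND_BackendSetState(backend, new BackendState()));
  return nullptr;  // success
}

extern "C" TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
  void* state;
  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &state));
  delete reinterpret_cast<BackendState*>(state);
  return nullptr;  // success
}
```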
#### TRITONBACKEND_Model
A TRITONBACKEND_Model object represents a model. Each model loaded by
Triton is associated with a TRITONBACKEND_Model. Each model can use
the TRITONBACKEND_ModelBackend API to get the backend object
representing the backend that is used by the model.
The same model object is shared across all instances of that
model. The associated API, like TRITONBACKEND_ModelName, is used to
get information about the model and to associate a user-defined state
with the model.
Most backends will implement TRITONBACKEND_ModelInitialize and
TRITONBACKEND_ModelFinalize to initialize the backend for a given
model and to manage the user-defined state associated with the model
(for more information see [backend lifecycles](#backend-lifecycles)).
The backend must take into account threading concerns when
implementing TRITONBACKEND_ModelInitialize and
TRITONBACKEND_ModelFinalize. Triton will not perform multiple
simultaneous calls to these functions for a given model; however, if a
backend is used by multiple models Triton may simultaneously call the
functions with a different thread for each model. As a result, the
backend must be able to handle multiple simultaneous calls to the
functions. Best practice for backend implementations is to use only
function-local and model-specific user-defined state in these
functions, as is shown in the [tutorial](examples/README.md).
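A hedged sketch of that pattern, again assuming a hypothetical
user-defined ModelState type and the RETURN_IF_ERROR utility (the
tutorial backends implement a richer version of the same idea):
```
// Hypothetical user-defined, model-specific state.
struct ModelState {
  std::string name;
  uint64_t version;
};

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
  const char* name;
  uint64_t version;
  RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &name));
  RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));

  // Keep state model-specific by attaching it to this model object only.
  RETURN_IF_ERROR(
      TRITONBACKEND_ModelSetState(model, new ModelState{name, version}));
  return nullptr;  // success
}

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
  void* state;
  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &state));
  delete reinterpret_cast<ModelState*>(state);
  return nullptr;  // success
}
```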
#### TRITONBACKEND_ModelInstance
A TRITONBACKEND_ModelInstance object represents a model
*instance*. Triton creates one or more instances of the model based on
the *instance_group* settings specified in the model
configuration. Each of these instances is associated with a
TRITONBACKEND_ModelInstance object.
The only function that the backend must implement is
TRITONBACKEND_ModelInstanceExecute. The
TRITONBACKEND_ModelInstanceExecute function is called by Triton to
perform inference/computation on a batch of inference requests. Most
backends will also implement TRITONBACKEND_ModelInstanceInitialize
and TRITONBACKEND_ModelInstanceFinalize to initialize the backend for
a given model instance and to manage the user-defined state associated
with the model instance (for more information see [backend
lifecycles](#backend-lifecycles)).
The backend must take into account threading concerns when
implementing TRITONBACKEND_ModelInstanceInitialize,
TRITONBACKEND_ModelInstanceFinalize and
TRITONBACKEND_ModelInstanceExecute. Triton will not perform multiple
simultaneous calls to these functions for a given model instance;
however, if a backend is used by a model with multiple instances or by
multiple models Triton may simultaneously call the functions with a
different thread for each model instance. As a result, the backend
must be able to handle multiple simultaneous calls to the
functions. Best practice for backend implementations is to use only
function-local and model-specific user-defined state in these
functions, as is shown in the [tutorial](examples/README.md).
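For example, a backend might use TRITONBACKEND_ModelInstanceInitialize
to discover where Triton placed the instance. A minimal sketch,
assuming the LOG_MESSAGE and RETURN_IF_ERROR utilities from this repo:
```
extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
  const char* name;
  TRITONSERVER_InstanceGroupKind kind;
  int32_t device_id;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &name));
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind));
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id));

  // Log which device (GPU or CPU) this instance will use.
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("instance ") + name + ": " +
       TRITONSERVER_InstanceGroupKindString(kind) + " device " +
       std::to_string(device_id))
          .c_str());
  return nullptr;  // success
}
```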
#### TRITONBACKEND_Request
A TRITONBACKEND_Request object represents an inference request made
to the model. The backend takes ownership of the request object(s) in
TRITONBACKEND_ModelInstanceExecute and must release each request by
calling TRITONBACKEND_RequestRelease. However, the ownership of request
object is returned back to Triton in case TRITONBACKEND_ModelInstanceExecute
returns an error. See [Inference Requests and Responses](#inference-requests-and-responses)
for more information about request lifecycle.
The Triton Backend API allows the backend to get information about the
request as well as the input tensors and requested output tensors of
the request. Each request input is represented by a TRITONBACKEND_Input
object.
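For illustration, the inputs and requested outputs of a request can be
enumerated as in the following sketch (it assumes a request obtained
inside TRITONBACKEND_ModelInstanceExecute and the RETURN_IF_ERROR
utility; headers are omitted):
```
// Enumerate the input tensors of 'request'.
uint32_t input_count;
RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(request, &input_count));
for (uint32_t i = 0; i < input_count; ++i) {
  TRITONBACKEND_Input* input;
  RETURN_IF_ERROR(TRITONBACKEND_RequestInputByIndex(request, i, &input));

  const char* name;
  TRITONSERVER_DataType datatype;
  const int64_t* shape;
  uint32_t dims_count;
  uint64_t byte_size;
  uint32_t buffer_count;
  RETURN_IF_ERROR(TRITONBACKEND_InputProperties(
      input, &name, &datatype, &shape, &dims_count, &byte_size,
      &buffer_count));
  // The tensor contents may be split across 'buffer_count' buffers,
  // each retrieved with TRITONBACKEND_InputBuffer.
}

// Enumerate the output tensors requested by 'request'.
uint32_t output_count;
RETURN_IF_ERROR(TRITONBACKEND_RequestOutputCount(request, &output_count));
for (uint32_t i = 0; i < output_count; ++i) {
  const char* output_name;
  RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName(request, i, &output_name));
}
```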
#### TRITONBACKEND_Response
A TRITONBACKEND_Response object represents a response sent by the
backend for a specific request. The backend uses the response API to
set the name, shape, datatype and tensor values for each output tensor
included in the response. The response can indicate either a failed or
a successful request. See [Inference Requests and
Responses](#inference-requests-and-responses) for more information
about request-response lifecycle.
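For example, a failed request can be reported by passing a
TRITONSERVER_Error to TRITONBACKEND_ResponseSend. A sketch following
the pattern used by the tutorial backends (it assumes `response` was
created for the failing request with TRITONBACKEND_ResponseNew and that
LOG_IF_ERROR comes from the backend utilities):
```
// Report a failure for this request by sending an error response.
TRITONSERVER_Error* err = TRITONSERVER_ErrorNew(
    TRITONSERVER_ERROR_INTERNAL, "inference failed for this request");
LOG_IF_ERROR(
    TRITONBACKEND_ResponseSend(
        response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
    "failed to send error response");
TRITONSERVER_ErrorDelete(err);
```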
### Backend Lifecycles
A backend must carefully manage the lifecycle of the backend itself,
the models and model instances that use the backend and the inference
requests that execute on the model instances using the backend.
#### Backend and Model
Backend, model and model instance initialization is triggered when
Triton loads a model.
* If the model requires a backend that is not already in use by an
  already loaded model, then:
  * Triton [loads the shared library](#backend-shared-library) that
    implements the backend required by the model.
  * Triton creates the TRITONBACKEND_Backend object that represents
    the backend.
  * Triton calls TRITONBACKEND_Initialize if it is implemented in the
    backend shared library. TRITONBACKEND_Initialize should not return
    until the backend is completely initialized. If
    TRITONBACKEND_Initialize returns an error, Triton will report that
    the model failed to load.
* Triton creates the TRITONBACKEND_Model object that represents the
  model. Triton calls TRITONBACKEND_ModelInitialize if it is
  implemented in the backend shared library.
  TRITONBACKEND_ModelInitialize should not return until the backend
  is completely initialized for the model. If
  TRITONBACKEND_ModelInitialize returns an error, Triton will show
  that the model failed to load.
* For each model instance specified for the model in the model
  configuration:
  * Triton creates the TRITONBACKEND_ModelInstance object that
    represents the model instance.
  * Triton calls TRITONBACKEND_ModelInstanceInitialize if it is
    implemented in the backend shared library.
    TRITONBACKEND_ModelInstanceInitialize should not return until the
    backend is completely initialized for the instance. If
    TRITONBACKEND_ModelInstanceInitialize returns an error, Triton
    will show that the model failed to load.
Backend, model and model instance finalization is triggered when
Triton unloads a model.
* For each model instance:
  * Triton calls TRITONBACKEND_ModelInstanceFinalize if it is
    implemented in the backend shared library.
    TRITONBACKEND_ModelInstanceFinalize should not return until the
    backend is completely finalized, including stopping any threads
    created for the model instance and freeing any user-defined state
    created for the model instance.
  * Triton destroys the TRITONBACKEND_ModelInstance object that
    represents the model instance.
* Triton calls TRITONBACKEND_ModelFinalize if it is implemented in the
  backend shared library. TRITONBACKEND_ModelFinalize should not
  return until the backend is completely finalized, including stopping
  any threads created for the model and freeing any user-defined state
  created for the model.
* Triton destroys the TRITONBACKEND_Model object that represents the
  model.
* Even if no other loaded model requires the backend, Triton does not
  finalize and unload the backend until the tritonserver process is
  exiting. When the tritonserver process exits:
  * Triton calls TRITONBACKEND_Finalize if it is implemented in the
    backend shared library. TRITONBACKEND_Finalize should not
    return until the backend is completely finalized, including
    stopping any threads created for the backend and freeing any
    user-defined state created for the backend.
  * Triton destroys the TRITONBACKEND_Backend object that represents
    the backend.
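The lifecycle above maps onto a small set of C entry points that Triton
resolves from the backend shared library. As a reference sketch, these
are their signatures (modulo export macros) as declared in
tritonbackend.h; only TRITONBACKEND_ModelInstanceExecute is required,
the rest are optional:
```
// Optional: backend-level lifecycle.
TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend);
TRITONSERVER_Error* TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend);

// Optional: model-level lifecycle.
TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model);
TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model);

// Optional: model-instance-level lifecycle.
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(
    TRITONBACKEND_ModelInstance* instance);
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(
    TRITONBACKEND_ModelInstance* instance);

// Required: execute a batch of requests on a model instance.
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count);
```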
#### Inference Requests and Responses
Triton calls TRITONBACKEND_ModelInstanceExecute to execute inference
requests on a model instance. Each call to
TRITONBACKEND_ModelInstanceExecute communicates a batch of requests
to execute and the instance of the model that should be used to
execute those requests. The backend should not allow the caller
thread to return from TRITONBACKEND_ModelInstanceExecute until that
instance is ready to handle another set of requests. Typically this
means that the TRITONBACKEND_ModelInstanceExecute function will
create responses and release the requests before returning. However,
in case TRITONBACKEND_ModelInstanceExecute returns an error, the ownership
of requests is transferred back to Triton which will then be responsible
for releasing them. Therefore, in the case where TRITONBACKEND_ModelInstanceExecute
returns an error, the backend must not retain references to the requests
or access them in any way. For more detailed description of request/response
lifetimes, study the documentation of TRITONBACKEND_ModelInstanceExecute in
[tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h).
##### Single Response
Most backends will create a single response for each request. For that
kind of backend, executing a single inference request requires the
following steps:
* Create a response for the request using TRITONBACKEND_ResponseNew.
* For each request input tensor use TRITONBACKEND_InputProperties to
get shape and datatype of the input as well as the buffer(s)
containing the tensor contents.
* For each output tensor which the request expects to be returned, use
TRITONBACKEND_ResponseOutput to create the output tensor of the
required datatype and shape. Use TRITONBACKEND_OutputBuffer to get a
pointer to the buffer where the tensor's contents should be written.
* Use the inputs to perform the inference computation that produces
the requested output tensor contents into the appropriate output
buffers.
* Optionally set parameters in the response.
* Send the response using TRITONBACKEND_ResponseSend.
* Release the request using TRITONBACKEND_RequestRelease.
For a batch of requests the backend should attempt to combine the
execution of the individual requests as much as possible to increase
performance.
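The steps above can be sketched in code roughly as follows, for one
request of the batch passed to TRITONBACKEND_ModelInstanceExecute. The
sketch assumes a model with a single input "IN0" copied to a single
output "OUT0" (as in the tutorial's *minimal* backend), CPU buffers,
the RETURN_IF_ERROR utility, and omits headers; a real backend must
also convert per-request errors into error responses rather than
simply returning.
```
// Create a response for this request.
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));

// Get the properties of the "IN0" input tensor.
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(request, "IN0", &input));
const char* input_name;
TRITONSERVER_DataType datatype;
const int64_t* shape;
uint32_t dims_count;
uint64_t byte_size;
uint32_t buffer_count;
RETURN_IF_ERROR(TRITONBACKEND_InputProperties(
    input, &input_name, &datatype, &shape, &dims_count, &byte_size,
    &buffer_count));

// Create the "OUT0" output with the same datatype and shape, and get a
// buffer to write its contents into.
TRITONBACKEND_Output* output;
RETURN_IF_ERROR(TRITONBACKEND_ResponseOutput(
    response, &output, "OUT0", datatype, shape, dims_count));
void* output_buffer;
TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t output_memory_type_id = 0;
RETURN_IF_ERROR(TRITONBACKEND_OutputBuffer(
    output, &output_buffer, byte_size, &output_memory_type,
    &output_memory_type_id));

// The "inference": copy the input buffers into the output buffer.
size_t offset = 0;
for (uint32_t b = 0; b < buffer_count; ++b) {
  const void* input_buffer;
  uint64_t input_buffer_byte_size;
  TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
  int64_t input_memory_type_id = 0;
  RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
      input, b, &input_buffer, &input_buffer_byte_size, &input_memory_type,
      &input_memory_type_id));
  memcpy(
      static_cast<char*>(output_buffer) + offset, input_buffer,
      input_buffer_byte_size);
  offset += input_buffer_byte_size;
}

// Send the single, final response and release the request.
RETURN_IF_ERROR(TRITONBACKEND_ResponseSend(
    response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* success */));
RETURN_IF_ERROR(
    TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL));
```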
##### Decoupled Responses
It is also possible for a backend to send multiple responses for a
request or not send any responses for a request. A backend may also
send responses out-of-order relative to the order that the request
batches are executed. Such backends are called *decoupled* backends.
Decoupled backends use one `ResponseFactory` object per request to
create and send any number of responses for that request. For this
kind of backend, executing a single inference request typically requires
the following steps:
* For each request input tensor use TRITONBACKEND_InputProperties to
get shape and datatype of the input as well as the buffer(s)
containing the tensor contents.
* Create a `ResponseFactory` object for the request using
TRITONBACKEND_ResponseFactoryNew.
1. Create a response from the `ResponseFactory` object using
TRITONBACKEND_ResponseNewFromFactory. As long as you hold the
`ResponseFactory` object you can continue creating responses.
2. For each output tensor which the request expects to be returned, use
TRITONBACKEND_ResponseOutput to create the output tensor of the
required datatype and shape. Use TRITONBACKEND_OutputBuffer to get a
pointer to the buffer where the tensor's contents should be written.
3. Use the inputs to perform the inference computation that produces
the requested output tensor contents into the appropriate output
buffers.
4. Optionally set parameters in the response.
5. Send the response using TRITONBACKEND_ResponseSend. If this is the
last response for the request then use the
TRITONSERVER_RESPONSE_COMPLETE_FINAL flag with
TRITONBACKEND_ResponseSend. Otherwise continue with Step 1 to send
the next response.
* Release the request using TRITONBACKEND_RequestRelease.
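A condensed sketch of the decoupled flow for one request, sending a
hypothetical number of responses num_responses (output creation is
elided; it is the same as in the single-response case, and
RETURN_IF_ERROR is again assumed from the backend utilities):
```
// Create a response factory for this request.
TRITONBACKEND_ResponseFactory* factory;
RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory, request));

for (uint32_t i = 0; i < num_responses; ++i) {
  TRITONBACKEND_Response* response;
  RETURN_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&response, factory));

  // ... create outputs and fill output buffers with
  // TRITONBACKEND_ResponseOutput / TRITONBACKEND_OutputBuffer ...

  // Only the last response for the request carries the FINAL flag.
  const uint32_t flags =
      (i + 1 == num_responses) ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0;
  RETURN_IF_ERROR(TRITONBACKEND_ResponseSend(response, flags, nullptr));
}

RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryDelete(factory));

// Release the request once its input buffers are no longer needed.
RETURN_IF_ERROR(
    TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL));
```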
###### Special Cases
The decoupled API is powerful and supports various special cases:
* If the backend should not send any response for the request,
TRITONBACKEND_ResponseFactorySendFlags can be used to send
TRITONSERVER_RESPONSE_COMPLETE_FINAL using the `ResponseFactory`.
* The model can also send responses out of the order in which it
received the requests.
* The backend can copy out the contents of the input buffer(s) if the
request must be released before the contents are completely
consumed to generate responses. After the copy, the request can be
released at any time before exiting TRITONBACKEND_ModelInstanceExecute.
The copies and the `ResponseFactory` object can be passed to a separate
thread in the backend. This means the main caller thread can exit from
TRITONBACKEND_ModelInstanceExecute and the backend can still continue
generating responses as long as it holds the `ResponseFactory` object.
The [repeat example](examples/README.md) demonstrates the full power of
what can be achieved with the decoupled API.
Study the documentation of these TRITONBACKEND_* functions in
[tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h)
for more details on these APIs. Read
[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
for more details on how to host a decoupled model.
## Build the Backend Utilities
The source in this repo builds into a single "backend utilities"
library that is useful when building backends. You don't need to use
these utilities but they will be helpful for most backends.
Typically you don't need to build this repo directly but instead you
can include it in the build of your backend as is shown in the
CMakeLists.txt files of the [tutorial examples](examples/README.md).
To build and install in a local directory use the following commands.
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
The following required Triton repositories will be pulled and used in
the build. By default the "main" branch/tag will be used for each repo
but the listed CMake argument can be used to override.
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
See the [CMakeLists.txt](CMakeLists.txt) file for other build options.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
TRITONBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONBACKEND_CMAKE_DIR})
if(NOT TARGET TritonBackend::triton-backend-utils)
include("${TRITONBACKEND_CMAKE_DIR}/TritonBackendTargets.cmake")
endif()
set(TRITONBACKEND_LIBRARIES TritonBackend::triton-backend-utils)
<!--
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Backend-Platform Support Matrix
Triton supports inference across various platforms such as cloud,
data center, edge and embedded devices on NVIDIA GPUs, x86 and ARM
CPUs, or AWS Inferentia, but it does so by relying on its backends.
Not all Triton backends support every platform. The purpose of this
document is to describe which compute platforms are supported by each
of these Triton backends.
GPU in this document refers to Nvidia GPU. See
[GPU, Driver, and CUDA Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
to learn more about supported GPUs.
## Ubuntu 20.04
The table below describes target device(s) supported for inference by
each backend on different platforms.
| Backend | x86 | ARM-SBSA |
| ------------ | --------- | ------------- |
| TensorRT | :heavy_check_mark: GPU <br/> :x: CPU | :heavy_check_mark: GPU <br/> :x: CPU |
| ONNX Runtime | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| TensorFlow | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| PyTorch | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| OpenVINO | :x: GPU <br/> :heavy_check_mark: CPU | :x: GPU <br/> :x: CPU |
| Python[^1] | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| DALI | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU[^2] <br/> :heavy_check_mark: CPU[^2] |
| FIL | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | Unsupported |
## Windows 10
Only TensorRT and ONNX Runtime backends are supported on Windows.
| Backend | x86 | ARM-SBSA |
| ------------ | --------- | ------------- |
| TensorRT | :heavy_check_mark: GPU <br/> :x: CPU | :heavy_check_mark: GPU <br/> :x: CPU |
| ONNX Runtime | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
## Jetson JetPack
The following backends are currently supported on Jetson JetPack:
| Backend | Jetson |
| ------------ | --------- |
| TensorRT | :heavy_check_mark: GPU <br/> :x: CPU |
| ONNX Runtime | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| TensorFlow | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| PyTorch | :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU |
| Python[^1] | :x: GPU <br/> :heavy_check_mark: CPU |
See [Triton Inference Server Support for Jetson and JetPack](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/jetson.md) for more details.
## AWS Inferentia
Currently, inference on AWS Inferentia is only supported via the
[Python backend](https://github.com/triton-inference-server/python_backend#running-with-inferentia),
where the deployed Python script invokes the AWS Neuron SDK.
[^1]: The supported devices for the Python backend are listed with
respect to Triton. The Python script running in the Python backend can
execute inference on any hardware for which Python APIs are available;
AWS Inferentia is one such example. Triton core is largely unaware
that inference will run on Inferentia.
[^2]: On ARM-SBSA, some operations are not fully supported.
<!--
# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
# Triton Example Backends
To learn how to create a Triton backend, and to see a best-practices
baseline onto which you can add your own backend logic, follow the
[Tutorial](#tutorial).
Triton also provides a couple of example backends that demonstrate
specific aspects of the backend API not covered by the
[Tutorial](#tutorial).
* The
[*repeat*](https://github.com/triton-inference-server/repeat_backend)
backend shows a more advanced example of how a backend can produce
multiple responses per request.
* The
[*stateful*](https://github.com/triton-inference-server/stateful_backend)
backend shows an example of how a backend can manage model state
tensors on the server-side for the [sequence
batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher)
to avoid transferring state tensors between client and server. Triton
also implements [Implicit State
Management](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#implicit-state-management)
which allows backends to behave in a stateless manner and leave the
state management to Triton.
## Tutorial
The [Triton Backend API](../README.md#triton-backend-api) exposes a
large number of features. The backend utilities and classes provide
many functions commonly used when creating a backend. But to create a
functional backend it is not necessary to use most of the backend API
or utilities. The tutorial starts with an implementation that shows a
*minimal* backend and then adds on recommended and optional
enhancements. The tutorial implementations follow best practices for
Triton backends and so can be used as templates for your own backend.
### *Minimal* Triton Backend
The source code for the *minimal* backend is contained in
[minimal.cc](backends/minimal/src/minimal.cc). The source code
contains extensive documentation describing the operation of the
backend and the use of the [Triton Backend
API](../README.md#triton-backend-api) and the backend
utilities. Before reading the source code, make sure you understand
the concepts associated with Triton backend abstractions
[TRITONBACKEND_Backend](../README.md#tritonbackend_backend),
[TRITONBACKEND_Model](../README.md#tritonbackend_model), and
[TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance).
The *minimal* backend does not perform any interesting operation; it
simply copies a single input tensor to a single output tensor. But it
does demonstrate the basic organization required for a Triton backend.
The *minimal* backend is complete but for clarity leaves out some
important aspects of writing a full-featured backend that are
described in [*Recommended* Triton
Backend](#recommended-triton-backend). When creating your own backend
use the [*Recommended* Triton Backend](#recommended-triton-backend) as
a starting point.
#### Building the *Minimal* Backend
[backends/minimal/CMakeLists.txt](backends/minimal/CMakeLists.txt)
shows the recommended build and install script for a Triton
backend. To build the *minimal* backend and install in a local directory
use the following commands.
```
$ cd backends/minimal
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
The following required Triton repositories will be pulled and used in
the build. By default the "main" branch/tag will be used for each repo
but the listed CMake argument can be used to override.
* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
If you are building on a release branch (or on a development branch
that is based off of a release branch), then you must set these cmake
arguments to point to that release branch as well. For example, if you
are building the r21.10 identity_backend branch then you need to use
the following additional cmake flags:
```
-DTRITON_BACKEND_REPO_TAG=r21.10
-DTRITON_CORE_REPO_TAG=r21.10
-DTRITON_COMMON_REPO_TAG=r21.10
```
After building the install directory will contain a backends/minimal
directory that contains the *minimal* backend. Instructions for adding
this backend to the Triton server are described in [Backend Shared
Library](../README.md#backend-shared-library).
#### Running Triton with the *Minimal* Backend
After adding the *minimal* backend to the Triton server as described
in [Backend Shared Library](../README.md#backend-shared-library), you
can run Triton and have it load the models in
[model_repos/minimal_models](model_repos/minimal_models). Assuming you
have created a *tritonserver* Docker image by adding the *minimal*
backend to Triton, the following command will run Triton:
```
$ docker run --rm -it --net=host -v/path/to/model_repos/minimal_models:/models tritonserver --model-repository=/models
```
The console output will be similar to the following, indicating that
the *batching* and *nonbatching* models from the minimal_models
repository have loaded correctly. Note that the model repository has
two models that both use the *minimal* backend. A backend can support
any number of different models.
```
I1215 23:46:00.250284 68 server.cc:589]
+-------------+---------+--------+
| Model | Version | Status |
+-------------+---------+--------+
| batching | 1 | READY |
| nonbatching | 1 | READY |
+-------------+---------+--------+
```
The models are identical except that the *batching* model enables the
[dynamic
batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher)
and supports batch sizes up to 8. Note that the *batching* model sets
the [batch
delay](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching)
to 5 seconds so that the example client described below can
demonstrate how the *minimal* backend receives a batch of requests.
#### Testing the *Minimal* Backend
The [clients](clients) directory holds example clients. The
[minimal_client](clients/minimal_client) Python script demonstrates
sending a couple of inference requests to the *minimal* backend. With
Triton running as described in [Running Triton with the *Minimal*
Backend](#running-triton-with-the-minimal-backend), execute the
client:
```
$ clients/minimal_client
```
The minimal_client first sends a single request to the nonbatching
model. From the output you can see that the input value is returned in
the output.
```
=========
Sending request to nonbatching model: IN0 = [1 2 3 4]
Response: {'model_name': 'nonbatching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [4], 'parameters': {'binary_data_size': 16}}]}
OUT0 = [1 2 3 4]
```
In the Triton console output you can see the log message printed by
the *minimal* backend that indicates that it received a batch
containing the single request.
```
I1221 18:14:12.964836 86 minimal.cc:348] model nonbatching: requests in batch 1
I1221 18:14:12.964857 86 minimal.cc:356] batched IN0 value: [ 1, 2, 3, 4 ]
```
The minimal_client next sends 2 requests at the same time to the
batching model. Triton will dynamically batch those requests into a
single batch and send that single batch to the *minimal* backend.
```
=========
Sending request to batching model: IN0 = [[10 11 12 13]]
Sending request to batching model: IN0 = [[20 21 22 23]]
Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]}
OUT0 = [[10 11 12 13]]
Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]}
OUT0 = [[20 21 22 23]]
```
In the Triton console output you can see the log message indicating
that the *minimal* backend received a batch containing both requests.
```
I1221 18:14:17.965982 86 minimal.cc:348] model batching: requests in batch 2
I1221 18:14:17.966035 86 minimal.cc:356] batched IN0 value: [ 10, 11, 12, 13, 20, 21, 22, 23 ]
```
### *Recommended* Triton Backend
The source code for the *recommended* backend is contained in
[recommended.cc](backends/recommended/src/recommended.cc). The source
code contains extensive documentation describing the operation of the
backend and the use of the [Triton Backend
API](../README.md#triton-backend-api) and the backend
utilities. Before reading the source code, make sure you understand
the concepts associated with Triton backend abstractions
[TRITONBACKEND_Backend](../README.md#tritonbackend_backend),
[TRITONBACKEND_Model](../README.md#tritonbackend_model), and
[TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance).
The *recommended* backend improves the [*minimal*
backend](#minimal-triton-backend) to include the following features
which should be present in any robust backend implementation:
* Enhances the backend to support models with input/output tensors
that have datatypes other than INT32.
* Enhances the backend to support models with input/output tensors
that have any shape.
* Uses the Triton backend metric APIs to record statistics about
requests executing in the backend. These metrics can then be queried
using the Triton
[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md)
and
[statistics](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md)
APIs.
* Additional error checking to ensure that the backend's version is
compatible with Triton and that each model's configuration is
compatible with the backend.
As with the *minimal* backend, the *recommended* backend just returns
the input tensor value in the output tensor. Because of the additions
described above, the *recommended* backend can serve as a starting
point for your backend.
#### Building the *Recommended* Backend
[backends/recommended/CMakeLists.txt](backends/recommended/CMakeLists.txt)
shows the recommended build and install script for a Triton
backend. Building and installing is the same as described in [Building
the *Minimal* Backend](#building-the-minimal-backend).
#### Running Triton with the *Recommended* Backend
After adding the *recommended* backend to the Triton server as
described in [Backend Shared
Library](../README.md#backend-shared-library), you can run Triton and
have it load the models in
[model_repos/recommended_models](model_repos/recommended_models). Assuming
you have created a *tritonserver* Docker image by adding the
*recommended* backend to Triton, the following command will run
Triton:
```
$ docker run --rm -it --net=host -v/path/to/model_repos/recommended_models:/models tritonserver --model-repository=/models
```
The console output will be similar to the following, indicating that
the *batching* model from the recommended_models repository has
loaded correctly.
```
I1215 23:46:00.250284 68 server.cc:589]
+-------------+---------+--------+
| Model | Version | Status |
+-------------+---------+--------+
| batching | 1 | READY |
+-------------+---------+--------+
```
#### Testing the *Recommended* Backend
The [clients](clients) directory holds example clients. The
[recommended_client](clients/recommended_client) Python script
demonstrates sending a couple of inference requests to the
*recommended* backend. With Triton running as described in [Running
Triton with the *Recommended*
Backend](#running-triton-with-the-recommended-backend), execute the
client:
```
$ clients/recommended_client
```
The recommended_client sends 2 requests at the same time to the
batching model, similar to what was done above with the *minimal*
backend. Triton will dynamically batch those requests into a single
batch and send that single batch to the *recommended* backend. This
model supports batching, uses the FP32 datatype, and has tensor
shape [ -1, 4, 4 ].
```
=========
Sending request to batching model: input = [[[1. 1.1 1.2 1.3]
[2. 2.1 2.2 2.3]
[3. 3.1 3.2 3.3]
[4. 4.1 4.2 4.3]]]
Sending request to batching model: input = [[[10. 10.1 10.2 10.3]
[20. 20.1 20.2 20.3]
[30. 30.1 30.2 30.3]
[40. 40.1 40.2 40.3]]]
Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]}
OUTPUT = [[[1. 1.1 1.2 1.3]
[2. 2.1 2.2 2.3]
[3. 3.1 3.2 3.3]
[4. 4.1 4.2 4.3]]]
Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]}
OUTPUT = [[[10. 10.1 10.2 10.3]
[20. 20.1 20.2 20.3]
[30. 30.1 30.2 30.3]
[40. 40.1 40.2 40.3]]]
```
In the Triton console output you can see the log message indicating
that the *recommended* backend received a batch containing both
requests.
```
I1221 18:30:52.223226 127 recommended.cc:604] model batching: requests in batch 2
I1221 18:30:52.223313 127 recommended.cc:613] batched INPUT value: [ 1.000000, 1.100000, 1.200000, 1.300000, 2.000000, 2.100000, 2.200000, 2.300000, 3.000000, 3.100000, 3.200000, 3.300000, 4.000000, 4.100000, 4.200000, 4.300000, 10.000000, 10.100000, 10.200000, 10.300000, 20.000000, 20.100000, 20.200001, 20.299999, 30.000000, 30.100000, 30.200001, 30.299999, 40.000000, 40.099998, 40.200001, 40.299999 ]
```
Because the *recommended* backend can support models that have
input/output tensors with any datatype and shape, you can edit the
model configuration and the client to experiment with these options.
To see the metrics collected for these two inference requests, use the following command to access Triton's metrics endpoint.
```
$ curl localhost:8002/metrics
```
The output will be metric values in Prometheus data format. The
[metrics
documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md)
gives a description of these metric values.
```
# HELP nv_inference_request_success Number of successful inference requests, all batch sizes
# TYPE nv_inference_request_success counter
nv_inference_request_success{model="batching",version="1"} 2.000000
# HELP nv_inference_request_failure Number of failed inference requests, all batch sizes
# TYPE nv_inference_request_failure counter
nv_inference_request_failure{model="batching",version="1"} 0.000000
# HELP nv_inference_count Number of inferences performed
# TYPE nv_inference_count counter
nv_inference_count{model="batching",version="1"} 2.000000
# HELP nv_inference_exec_count Number of model executions performed
# TYPE nv_inference_exec_count counter
nv_inference_exec_count{model="batching",version="1"} 1.000000
...
```
You can also see the collected statistics using the [statistics
endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md).
```
$ curl localhost:8000/v2/models/batching/stats
{"model_stats":[{"name":"batching","version":"1","last_inference":1640111452223,"inference_count":2,"execution_count":1,"inference_stats":{"success":{"count":2,"ns":9997025869},"fail":{"count":0,"ns":0},"queue":{"count":2,"ns":9996491319},"compute_input":{"count":2,"ns":95288},"compute_infer":{"count":2,"ns":232202},"compute_output":{"count":2,"ns":195850}},"batch_stats":[{"batch_size":2,"compute_input":{"count":1,"ns":47644},"compute_infer":{"count":1,"ns":116101},"compute_output":{"count":1,"ns":97925}}]}]}
```
### *BLS* Triton Backend
Please see the [documentation](backends/bls/README.md) for the *BLS* backend.
### Enhancements
This section describes several optional features that you can add to
enhance the capabilities of your backend.
#### Automatic Model Configuration Generation
[Automatic model configuration
generation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#auto-generated-model-configuration)
is enabled by the backend implementing the appropriate logic (for
example, in a function called AutoCompleteConfig) during
TRITONBACKEND_ModelInitialize. For the *recommended* backend you would
add a call to AutoCompleteConfig in the ModelState constructor just
before the call to ValidateModelConfig. The AutoCompleteConfig
function can update the model configuration with input tensor, output
tensor, and max-batch-size configuration; and then update the
configuration using TRITONBACKEND_ModelSetConfig. Examples can be
found in [ONNXRuntime
backend](https://github.com/triton-inference-server/onnxruntime_backend),
[TensorFlow
backend](https://github.com/triton-inference-server/tensorflow_backend)
and other backends.
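The following is a minimal sketch, assuming the model configuration is
held in a `common::TritonJson::Value` named `model_config_` (as in the
`BackendModel` utility class); the function name `AutoCompleteConfig`,
the default `max_batch_size` of 8, and the fields filled in are
illustrative assumptions rather than part of any existing backend.
```
// Hypothetical auto-complete helper; the default max_batch_size and the
// fields completed here are assumptions for illustration only.
TRITONSERVER_Error*
ModelState::AutoCompleteConfig()
{
  // Add max_batch_size if the configuration does not already provide it.
  if (!model_config_.Find("max_batch_size")) {
    RETURN_IF_ERROR(model_config_.AddInt("max_batch_size", 8));
  }

  // Input and output tensors could be filled in the same way, typically
  // by inspecting the model artifact.

  // Serialize the updated configuration and hand it back to Triton.
  common::TritonJson::WriteBuffer buffer;
  RETURN_IF_ERROR(model_config_.Write(&buffer));

  TRITONSERVER_Message* message;
  RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(
      &message, buffer.Base(), buffer.Size()));
  RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(
      TritonModel(), 1 /* config_version */, message));
  RETURN_IF_ERROR(TRITONSERVER_MessageDelete(message));

  return nullptr;  // success
}
```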
#### Add Key-Value Parameters to a Response
A backend can add a key-value pair to a response any time after the
response is created and before it is sent. The parameter key must be a
string and the parameter value can be a string, integer or
boolean. The following example shows the TRITONBACKEND API used to set
response parameters. Error checking code is omitted for clarity.
```
TRITONBACKEND_ResponseSetStringParameter(response, "param0", "an example string parameter");
TRITONBACKEND_ResponseSetIntParameter(response, "param1", 42);
TRITONBACKEND_ResponseSetBoolParameter(response, "param2", false);
```
#### Access Model Artifacts in the Model Repository
A backend can access any of the files in a model's area of the model
repository. These files are typically needed during
TRITONBACKEND_ModelInitialize but can be accessed at other times as
well. The TRITONBACKEND_ModelRepository API gives the location of the
model's repository. For example, the following code can be run during
TRITONBACKEND_ModelInitialize to write the location to the log.
```
// Can get location of the model artifacts. Normally we would need
// to check the artifact type to make sure it was something we can
// handle... but we are just going to log the location so we don't
// need the check. We would use the location if we wanted to load
// something from the model's repo.
TRITONBACKEND_ArtifactType artifact_type;
const char* clocation;
RETURN_IF_ERROR(
TRITONBACKEND_ModelRepository(model, &artifact_type, &clocation));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("Repository location: ") + clocation).c_str());
```
The framework backends (for example, TensorRT, ONNXRuntime,
TensorFlow, PyTorch) read the actual model file from the model
repository using this API. See those backends for examples of how it
can be used.
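As a hedged illustration (not taken from any particular backend), the
repository location can be combined with the model version to build a
path to an artifact. Here `JoinPath` is a helper from the backend
utilities and the file name `model.onnx` is an assumption made only for
this sketch.
```
// Hypothetical continuation of the example above: build a filesystem
// path to a model artifact. The artifact file name is an assumption.
TRITONBACKEND_ArtifactType artifact_type;
const char* clocation;
RETURN_IF_ERROR(
    TRITONBACKEND_ModelRepository(model, &artifact_type, &clocation));

uint64_t version;
RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));

// Only handle a local filesystem repository in this sketch.
if (artifact_type == TRITONBACKEND_ARTIFACT_FILESYSTEM) {
  // For a filesystem repository, version N of the model lives in
  // "<location>/<N>/".
  const std::string artifact_path =
      JoinPath({clocation, std::to_string(version), "model.onnx"});
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("Artifact path: ") + artifact_path).c_str());
}
```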
<!--
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# *BLS* Triton Backend
The [*BLS*](../bls) backend demonstrates using the Triton in-process C API to
execute inferences within the backend. This backend serves as an example for
backend developers implementing their own custom pipelines in C++.
For Python use cases, please refer to
[Business Logic Scripting](https://github.com/triton-inference-server/python_backend/blob/main/README.md#business-logic-scripting)
section in Python backend.
The source code for the *bls* backend is contained in
[src](./src).
* [backend.cc](./src/backend.cc) contains the main backend
implementation. The content of this file is not BLS specific. It only includes
the required Triton backend functions that are standard for any backend
implementation. The BLS logic is invoked from
`TRITONBACKEND_ModelInstanceExecute` with the line `bls_executor.Execute(requests[r], &responses[r]);`.
* [bls.h](./src/bls.h) is where the BLS logic (class `BLSExecutor`) of
this example is located. You can refer to this file to see how to use the
Triton in-process C API to build a custom execution pipeline.
* [bls_utils.h](./src/bls_utils.h) is where all the utilities that
are not BLS-specific are located.
The source code contains extensive documentation describing the operation of
the backend and the use of the
[Triton Backend API](../../../README.md#triton-backend-api) and the
[Triton Server API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#in-process-triton-server-api).
Before reading the source code, make sure you understand
the concepts associated with Triton backend abstractions
[TRITONBACKEND_Backend](../../../README.md#tritonbackend_backend),
[TRITONBACKEND_Model](../../../README.md#tritonbackend_model), and
[TRITONBACKEND_ModelInstance](../../../README.md#tritonbackend_modelinstance).
The *bls* backend sends two requests, one to the 'addsub_python' model and one
to the 'addsub_tf' model. After both inference requests complete, the backend
extracts OUTPUT0 from the 'addsub_python' response and OUTPUT1 from the
'addsub_tf' response to construct the final inference response object from
these tensors.
There are some self-imposed limitations made to keep this example simple:
1. This backend does not support batching.
2. This backend does not support decoupled models.
3. This backend does not support GPU tensors.
4. The model configuration must be set exactly as described in the comments in
[backend.cc](./src/backend.cc).
Your own backend does not need to have these limitations.
## Building the *BLS* Backend
[backends/bls/CMakeLists.txt](CMakeLists.txt)
shows the recommended build and install script for a Triton
backend. Building and installing is the same as described in [Building
the *Minimal* Backend](../../README.md#building-the-minimal-backend).
## Running Triton with the *BLS* Backend
After adding the *bls* backend to the Triton server as
described in [Backend Shared
Library](../../../README.md#backend-shared-library), you can run Triton and
have it load the models in
[model_repos/bls_models](../../model_repos/bls_models). Assuming you have created a
*tritonserver* Docker image by adding the *bls* backend to Triton, the
following command will run Triton:
```
$ docker run --rm -it --net=host -v/path/to/model_repos/bls_models:/models tritonserver --model-repository=/models
```
The console output will be similar to the following, indicating that the
*bls_fp32*, *addsub_python* and *addsub_tf* models from the bls_models
repository have loaded correctly.
```
I0616 09:34:47.767433 19214 server.cc:629]
+---------------+---------+--------+
| Model | Version | Status |
+---------------+---------+--------+
| addsub_python | 1 | READY |
| addsub_tf | 1 | READY |
| bls_fp32 | 1 | READY |
+---------------+---------+--------+
```
## Testing the *BLS* Backend
The [clients](../../clients) directory holds example clients. The
[bls_client](../../clients/bls_client) Python script demonstrates sending an
inference request to the *bls* backend. With Triton running as
described in [Running Triton with the *BLS* Backend](#running-triton-with-the-bls-backend),
execute the client:
```
$ clients/bls_client
```
You should see output similar to the following:
```
INPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954 0.17747518 0.7976901 ]) + INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT0 ([1.1068735 0.75736016 1.1136982 ... 1.0824126 0.4319935 1.5886607 ])
INPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954 0.17747518 0.7976901 ]) - INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT1 ([-0.24816778 0.27289516 -0.24118033 ... 0.25177827 -0.07704315 0.00671947])
PASS
```
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
TRITONBLSBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONBLSBACKEND_CMAKE_DIR})
if(NOT TARGET TritonBLSBackend::triton-bls-backend)
include("${TRITONBLSBACKEND_CMAKE_DIR}/TritonBLSBackendTargets.cmake")
endif()
set(TRITONBLSBACKEND_LIBRARIES TritonBLSBackend::triton-bls-backend)
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "bls.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
//
// Backend that demonstrates using in-process C-API to execute inferences
// within the backend.
//
// Two particular models, 'addsub_python' and 'addsub_tf', must be loaded on
// the server for a successful inference execution on this backend.
//
// The model configuration should be set as follows in order to be in line with
// the 'addsub_python' and 'addsub_tf' models. This backend does not support
// batching. These limitations are only for this specific backend. You can
// implement your own BLS backend with fewer limitations.
//
// Model Configuration:
// - Input 'INPUT0' must have shape [16] and datatype must be TYPE_FP32.
//
// - Input 'INPUT1' must have shape [16] and datatype must be TYPE_FP32.
//
// - For each response, output 'OUTPUT0' must have shape [16] and
// datatype TYPE_FP32.
//
// - For each response, output 'OUTPUT1' must have shape [16] and
// datatype TYPE_FP32.
//
// This backend will send two requests on the 'addsub_python' and 'addsub_tf'
// models. After the inference requests are completed, this backend
// will extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the
// 'addsub_tf' model to construct the final inference response object using
// these tensors.
namespace triton { namespace backend { namespace bls {
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model.
//
class ModelState : public BackendModel {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
// Validate that model configuration is supported by this backend.
TRITONSERVER_Error* ValidateModelConfig();
private:
ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {}
};
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
TRITONSERVER_Error*
ModelState::ValidateModelConfig()
{
// We have the json DOM for the model configuration...
common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(model_config_.PrettyWrite(&buffer));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model configuration:\n") + buffer.Contents()).c_str());
// max_batch_size must be 0 because this backend does not support
// batching
int64_t max_batch_size;
RETURN_IF_ERROR(model_config_.MemberAsInt("max_batch_size", &max_batch_size));
RETURN_ERROR_IF_FALSE(
max_batch_size == 0, TRITONSERVER_ERROR_INVALID_ARG,
std::string("bls backend only supports models with max_batch_size == 0"));
common::TritonJson::Value inputs, outputs;
RETURN_IF_ERROR(model_config_.MemberAsArray("input", &inputs));
RETURN_IF_ERROR(model_config_.MemberAsArray("output", &outputs));
// There must be 2 inputs and 2 outputs.
RETURN_ERROR_IF_FALSE(
inputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected 2 inputs, got ") +
std::to_string(inputs.ArraySize()));
RETURN_ERROR_IF_FALSE(
outputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected 2 outputs, got ") +
std::to_string(outputs.ArraySize()));
// Here we rely on the model configuation listing the inputs and
// outputs in a specific order, which we shouldn't really require...
common::TritonJson::Value input0, input1, output0, output1;
RETURN_IF_ERROR(inputs.IndexAsObject(0, &input0));
RETURN_IF_ERROR(inputs.IndexAsObject(1, &input1));
RETURN_IF_ERROR(outputs.IndexAsObject(0, &output0));
RETURN_IF_ERROR(outputs.IndexAsObject(1, &output1));
// Check tensor names
std::string in0_name, in1_name, out0_name, out1_name;
RETURN_IF_ERROR(input0.MemberAsString("name", &in0_name));
RETURN_IF_ERROR(input1.MemberAsString("name", &in1_name));
RETURN_IF_ERROR(output0.MemberAsString("name", &out0_name));
RETURN_IF_ERROR(output1.MemberAsString("name", &out1_name));
RETURN_ERROR_IF_FALSE(
in0_name == "INPUT0", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected first input tensor name to be INPUT0, got ") +
in0_name);
RETURN_ERROR_IF_FALSE(
in1_name == "INPUT1", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected second input tensor name to be INPUT1, got ") +
in1_name);
RETURN_ERROR_IF_FALSE(
out0_name == "OUTPUT0", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected first output tensor name to be OUTPUT0, got ") +
out0_name);
RETURN_ERROR_IF_FALSE(
out1_name == "OUTPUT1", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected second output tensor name to be OUTPUT1, got ") +
out1_name);
// Check shapes
std::vector<int64_t> in0_shape, in1_shape, out0_shape, out1_shape;
RETURN_IF_ERROR(backend::ParseShape(input0, "dims", &in0_shape));
RETURN_IF_ERROR(backend::ParseShape(input1, "dims", &in1_shape));
RETURN_IF_ERROR(backend::ParseShape(output0, "dims", &out0_shape));
RETURN_IF_ERROR(backend::ParseShape(output1, "dims", &out1_shape));
RETURN_ERROR_IF_FALSE(
in0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected INPUT0 shape to have one dimension, got ") +
backend::ShapeToString(in0_shape));
RETURN_ERROR_IF_FALSE(
in1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected INPUT1 shape to have one dimension, got ") +
backend::ShapeToString(in1_shape));
RETURN_ERROR_IF_FALSE(
out0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected OUTPUT0 shape to have one dimension, got ") +
backend::ShapeToString(out0_shape));
RETURN_ERROR_IF_FALSE(
out1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected OUTPUT1 shape to have one dimension, got ") +
backend::ShapeToString(out1_shape));
// Check datatypes
std::string in0_dtype, in1_dtype, out0_dtype, out1_dtype;
RETURN_IF_ERROR(input0.MemberAsString("data_type", &in0_dtype));
RETURN_IF_ERROR(input1.MemberAsString("data_type", &in1_dtype));
RETURN_IF_ERROR(output0.MemberAsString("data_type", &out0_dtype));
RETURN_IF_ERROR(output1.MemberAsString("data_type", &out1_dtype));
RETURN_ERROR_IF_FALSE(
in0_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected INPUT0 datatype to be TYPE_FP32, got ") +
in0_dtype);
RETURN_ERROR_IF_FALSE(
in1_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected INPUT1 datatype to be TYPE_FP32, got ") +
in1_dtype);
RETURN_ERROR_IF_FALSE(
out0_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected OUTPUT0 datatype to be TYPE_FP32, got ") +
out0_dtype);
RETURN_ERROR_IF_FALSE(
out1_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected OUTPUT1 datatype to be TYPE_FP32, got ") +
out1_dtype);
return nullptr; // success
}
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each TRITONBACKEND_ModelInstance.
//
class ModelInstanceState : public BackendModelInstance {
public:
static TRITONSERVER_Error* Create(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state);
virtual ~ModelInstanceState() = default;
void ProcessRequests(
TRITONBACKEND_Request** requests, const uint32_t request_count);
private:
ModelInstanceState(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance)
{
}
};
TRITONSERVER_Error*
ModelInstanceState::Create(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
void
ModelInstanceState::ProcessRequests(
TRITONBACKEND_Request** requests, const uint32_t request_count)
{
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
for (size_t i = 0; i < request_count; i++) {
// If we get a nullptr request then something is badly wrong. Fail
// and release all requests.
if (requests[i] == nullptr) {
RequestsRespondWithError(
requests, request_count,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
"null request given to BLS backend for '" + Name() + "'")
.c_str()));
return;
}
}
// At this point we accept ownership of 'requests', which means that
// even if something goes wrong we must still return success from
// this function. If something does go wrong in processing a
// particular request then we send an error response just for the
// specific request.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
for (size_t i = 0; i < request_count; i++) {
TRITONBACKEND_Response* response;
auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
if (err == nullptr) {
responses.emplace_back(response);
} else {
responses.emplace_back(nullptr);
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to create response");
TRITONSERVER_ErrorDelete(err);
}
}
ModelState* model_state = reinterpret_cast<ModelState*>(Model());
// The way we collect these batch timestamps is not entirely
// accurate. Normally, in a performant backend you would execute all
// the requests at the same time, and so there would be a single
// compute-start / compute-end time-range. But here we execute each
// request separately so there is no single range. As a result we
// just show the entire execute time as being the compute time as
// well.
uint64_t compute_start_ns = 0;
SET_TIMESTAMP(compute_start_ns);
// Create a BLSExecutor object. To separate from standard backend
// implementation, the BLS logic is placed inside class BLSExecutor.
BLSExecutor bls_executor(model_state->TritonServer());
for (size_t r = 0; r < request_count; r++) {
bls_executor.Execute(requests[r], &responses[r]);
}
uint64_t compute_end_ns = 0;
SET_TIMESTAMP(compute_end_ns);
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
// Send all the responses that haven't already been sent because of
// an earlier error. Note that the responses are not set to nullptr
// here as we need that indication below to determine if the request
// was successful or not.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send BLS backend response");
}
}
// Report statistics for each request.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportStatistics(
TritonModelInstance(), request,
(responses[r] != nullptr) /* success */, exec_start_ns,
compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting request statistics");
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
// Report the entire batch statistics.
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TritonModelInstance(), 1 /*total_batch_size*/, exec_start_ns,
compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting batch request statistics");
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("TRITONBACKEND_ModelExecute: model ") + Name() +
" released " + std::to_string(request_count) + " requests")
.c_str());
}
/////////////
extern "C" {
// Implementing TRITONBACKEND_ModelInitialize is optional. The backend
// should initialize any state that is intended to be shared across
// all instances of the model.
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
std::string name(cname);
uint64_t version;
RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " +
std::to_string(version) + ")")
.c_str());
// With each model we create a ModelState object and associate it
// with the TRITONBACKEND_Model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
// One of the primary things to do in ModelInitialize is to examine
// the model configuration to ensure that it is something that this
// backend can support. If not, returning an error from this
// function will prevent the model from loading.
RETURN_IF_ERROR(model_state->ValidateModelConfig());
return nullptr; // success
}
// Implementing TRITONBACKEND_ModelFinalize is optional unless state
// is set using TRITONBACKEND_ModelSetState. The backend must free
// this state and perform any other cleanup.
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
delete model_state;
return nullptr; // success
}
// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The
// backend should initialize any state that is required for a model
// instance.
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));
std::string name(cname);
int32_t device_id;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id));
TRITONSERVER_InstanceGroupKind kind;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" +
TRITONSERVER_InstanceGroupKindString(kind) + " device " +
std::to_string(device_id) + ")")
.c_str());
// The instance can access the corresponding model as well... here
// we get the model and from that get the model's state.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// With each instance we create a ModelInstanceState object and
// associate it with the TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("TRITONBACKEND_ModelInstanceInitialize: instance "
"initialization successful ") +
name + " (device " + std::to_string(device_id) + ")")
.c_str());
return nullptr; // success
}
// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless
// state is set using TRITONBACKEND_ModelInstanceSetState. The backend
// must free this state and perform any other cleanup.
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
delete instance_state;
return nullptr; // success
}
// Implementing TRITONBACKEND_ModelInstanceExecute is required.
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Suggested practice for this is to use only
// function-local and model-instance-specific state (obtained from
// 'instance'), which is what we do here.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state =
reinterpret_cast<ModelState*>(instance_state->Model());
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("model ") + model_state->Name() + ", instance " +
instance_state->Name() + ", executing " + std::to_string(request_count) +
" requests")
.c_str());
instance_state->ProcessRequests(requests, request_count);
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::bls
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "bls.h"
namespace triton { namespace backend { namespace bls {
BLSExecutor::BLSExecutor(TRITONSERVER_Server* server)
: server_(server), model_executor_(server)
{
}
TRITONSERVER_Error*
BLSExecutor::PrepareInferenceRequest(
TRITONBACKEND_Request* bls_request,
TRITONSERVER_InferenceRequest** irequest, const std::string model_name)
{
// Get request_id, correlation_id, and flags from the current request
// for preparing a new inference request that we will send to 'addsub_python'
// or 'addsub_tf' model later.
const char* request_id;
uint64_t correlation_id;
uint32_t flags;
RETURN_IF_ERROR(TRITONBACKEND_RequestId(bls_request, &request_id));
RETURN_IF_ERROR(
TRITONBACKEND_RequestCorrelationId(bls_request, &correlation_id));
RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(bls_request, &flags));
// Create an inference request object. The inference request object
// is where we set the name of the model we want to use for
// inference and the input tensors.
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestNew(
irequest, server_, model_name.c_str(), -1 /* model_version */));
// Set request_id, correlation_id, and flags for the new request.
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetId(*irequest, request_id));
RETURN_IF_ERROR(
TRITONSERVER_InferenceRequestSetCorrelationId(*irequest, correlation_id));
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetFlags(*irequest, flags));
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback(
*irequest, InferRequestComplete, nullptr /* request_release_userp */));
return nullptr; // success
}
TRITONSERVER_Error*
BLSExecutor::PrepareInferenceInput(
TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)
{
// Get the properties of the two inputs from the current request.
// Then, add the two input tensors and append the input data to the new
// request.
uint32_t input_count;
RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(bls_request, &input_count));
TRITONBACKEND_Input* input;
const char* name;
TRITONSERVER_DataType datatype;
const int64_t* shape;
uint32_t dims_count;
size_t data_byte_size;
TRITONSERVER_MemoryType data_memory_type;
int64_t data_memory_id;
const char* data_buffer;
for (size_t count = 0; count < input_count; count++) {
RETURN_IF_ERROR(TRITONBACKEND_RequestInputByIndex(
bls_request, count /* index */, &input));
RETURN_IF_ERROR(TRITONBACKEND_InputProperties(
input, &name, &datatype, &shape, &dims_count, nullptr, nullptr));
RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
input, 0 /* idx */, reinterpret_cast<const void**>(&data_buffer),
&data_byte_size, &data_memory_type, &data_memory_id));
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAddInput(
irequest, name, datatype, shape, dims_count));
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAppendInputData(
irequest, name, &data_buffer[0], data_byte_size, data_memory_type,
data_memory_id));
}
return nullptr; // success
}
TRITONSERVER_Error*
BLSExecutor::PrepareInferenceOutput(
TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)
{
// Indicate the output tensors to be calculated and returned
// for the inference request.
uint32_t output_count;
RETURN_IF_ERROR(TRITONBACKEND_RequestOutputCount(bls_request, &output_count));
const char* output_name;
for (size_t count = 0; count < output_count; count++) {
RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName(
bls_request, count /* index */, &output_name));
RETURN_IF_ERROR(
TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output_name));
}
return nullptr; // success
}
void
BLSExecutor::Execute(
TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response)
{
// The names of the models that we will send internal requests on.
std::vector<std::string> model_names = {"addsub_python", "addsub_tf"};
// Check if both models are valid before executing request.
try {
for (size_t i = 0; i < 2; i++) {
// Check if the model is ready.
bool is_ready = false;
THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady(
server_, model_names[i].c_str(), -1 /* model_version */, &is_ready));
if (!is_ready) {
throw BLSBackendException(
(std::string("Failed to execute the inference request. Model '") +
model_names[i].c_str() + "' is not ready.")
.c_str());
}
// For simplicity, decoupled API is not supported in this BLS backend. You
// can implement your own backend that supports decoupled models.
uint32_t txn_flags;
THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties(
server_, model_names[i].c_str(), -1 /* model_version */, &txn_flags,
nullptr /* voidp */));
if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) {
throw BLSBackendException(
std::string("Model '") + model_names[i].c_str() +
"' is using the decoupled. This BLS Backend doesn't support models "
"using the decoupled transaction policy.");
}
}
}
catch (const BLSBackendException& bls_exception) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());
RESPOND_AND_SET_NULL_IF_ERROR(
response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "Failed to send inference requests"));
return;
}
// Prepare std::future for each model. Since this BLS backend
// can handle requests in parallel, we will send all the inference
// requests first and then retrieve them later.
std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures(2);
// The inference request object for sending internal requests.
TRITONSERVER_InferenceRequest* irequest = nullptr;
// For each inference request, the backend sends two requests on the
// 'addsub_python' and 'addsub_tf' models.
try {
for (size_t icount = 0; icount < 2; icount++) {
// Initialize the inference request with required information.
THROW_IF_TRITON_ERROR(
PrepareInferenceRequest(bls_request, &irequest, model_names[icount]));
THROW_IF_TRITON_ERROR(PrepareInferenceInput(bls_request, irequest));
THROW_IF_TRITON_ERROR(PrepareInferenceOutput(bls_request, irequest));
// Execute inference request.
THROW_IF_TRITON_ERROR(
model_executor_.AsyncExecute(irequest, &futures[icount]));
}
}
catch (const BLSBackendException& bls_exception) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());
LOG_IF_ERROR(
TRITONSERVER_InferenceRequestDelete(irequest),
"Failed to delete inference request.");
RESPOND_AND_SET_NULL_IF_ERROR(
response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "Failed to send inference requests"));
return;
}
// If both internal requests are sent successfully, retrieve the output from
// each request and construct the final response.
ConstructFinalResponse(response, std::move(futures));
}
void
BLSExecutor::ConstructFinalResponse(
TRITONBACKEND_Response** response,
std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures)
{
// Prepare two TRITONSERVER_InferenceResponse* objects for 'addsub_python' and
// 'addsub_tf', respectively.
std::vector<TRITONSERVER_InferenceResponse*> completed_responses = {nullptr,
nullptr};
const char* output_name;
TRITONSERVER_DataType output_datatype;
const int64_t* output_shape;
uint64_t dims_count;
size_t output_byte_size;
TRITONSERVER_MemoryType output_memory_type;
int64_t output_memory_id;
const void* output_base;
void* userp;
for (size_t icount = 0; icount < 2; icount++) {
// Retrieve the corresponding TRITONSERVER_InferenceResponse object from
// 'futures'. The InferResponseComplete function sets the std::promise
// so that this thread will block until the response is returned.
completed_responses[icount] = futures[icount].get();
try {
THROW_IF_TRITON_ERROR(
TRITONSERVER_InferenceResponseError(completed_responses[icount]));
}
catch (const BLSBackendException& bls_exception) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());
if (completed_responses[icount] != nullptr) {
LOG_IF_ERROR(
TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),
"Failed to delete inference response.");
}
return;
}
// Retrieve outputs from 'completed_responses'.
// Extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the
// 'addsub_tf' model to form the final inference response object.
// Get all the information about the output tensor.
RESPOND_AND_SET_NULL_IF_ERROR(
response,
TRITONSERVER_InferenceResponseOutput(
completed_responses[icount], icount, &output_name, &output_datatype,
&output_shape, &dims_count, &output_base, &output_byte_size,
&output_memory_type, &output_memory_id, &userp));
// Create an output tensor in the final response with
// the information retrieved above.
TRITONBACKEND_Output* output;
RESPOND_AND_SET_NULL_IF_ERROR(
response, TRITONBACKEND_ResponseOutput(
*response, &output, output_name, output_datatype,
output_shape, dims_count));
// Get a buffer that holds the tensor data for the output.
// We request a buffer in CPU memory but we have to handle any returned
// type. If we get back a buffer in GPU memory we just fail the request.
void* output_buffer;
output_memory_type = TRITONSERVER_MEMORY_CPU;
RESPOND_AND_SET_NULL_IF_ERROR(
response, TRITONBACKEND_OutputBuffer(
output, &output_buffer, output_byte_size,
&output_memory_type, &output_memory_id));
if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
RESPOND_AND_SET_NULL_IF_ERROR(
response, TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"failed to create output buffer in CPU memory"));
}
// Fill the BLS output buffer with output data returned by internal
// requests.
memcpy(output_buffer, output_base, output_byte_size);
LOG_IF_ERROR(
TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),
"Failed to delete inference response.");
}
}
}}} // namespace triton::backend::bls
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <future>
#include "bls_utils.h"
#include "triton/backend/backend_common.h"
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend { namespace bls {
//
// BLSExecutor
//
// Includes the custom BLS logic for this backend.
// This class shows how to utilize Triton in-process C-API to build the
// execution pipeline.
//
class BLSExecutor {
public:
BLSExecutor(TRITONSERVER_Server* server);
// Prepares the inference request that will be used internally.
TRITONSERVER_Error* PrepareInferenceRequest(
TRITONBACKEND_Request* bls_request,
TRITONSERVER_InferenceRequest** irequest, const std::string model_name);
// Prepares the input for the internal inference request.
TRITONSERVER_Error* PrepareInferenceInput(
TRITONBACKEND_Request* bls_request,
TRITONSERVER_InferenceRequest* irequest);
// Prepares the output for the internal inference request.
TRITONSERVER_Error* PrepareInferenceOutput(
TRITONBACKEND_Request* bls_request,
TRITONSERVER_InferenceRequest* irequest);
// Performs the whole BLS pipeline.
void Execute(
TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response);
// Constructs the final response.
void ConstructFinalResponse(
TRITONBACKEND_Response** response,
std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures);
private:
// The server object that encapsulates all the functionality of the Triton
// server and allows access to the Triton server API.
TRITONSERVER_Server* server_;
// The ModelExecutor object for executing inference request on a model.
ModelExecutor model_executor_;
};
}}} // namespace triton::backend::bls
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "bls_utils.h"
namespace triton { namespace backend { namespace bls {
TRITONSERVER_Error*
CPUAllocator(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
// For simplicity, this backend example always uses CPU memory regardless of
// the preferred memory type. You can make the actual memory type and id that
// we allocate be the same as preferred memory type. You can also provide a
// customized allocator to support different preferred_memory_type, and reuse
// memory buffer when possible.
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = preferred_memory_type_id;
// If 'byte_size' is zero just return 'buffer' == nullptr, we don't
// need to do any other book-keeping.
if (byte_size == 0) {
*buffer = nullptr;
*buffer_userp = nullptr;
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE, ("allocated " + std::to_string(byte_size) +
" bytes for result tensor " + tensor_name)
.c_str());
} else {
void* allocated_ptr = nullptr;
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
allocated_ptr = malloc(byte_size);
// Pass the tensor name with buffer_userp so we can show it when
// releasing the buffer.
if (allocated_ptr != nullptr) {
*buffer = allocated_ptr;
*buffer_userp = new std::string(tensor_name);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
("allocated " + std::to_string(byte_size) + " bytes in " +
TRITONSERVER_MemoryTypeString(*actual_memory_type) +
" for result tensor " + tensor_name)
.c_str());
}
}
return nullptr; // Success
}
TRITONSERVER_Error*
ResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
std::string* name = nullptr;
if (buffer_userp != nullptr) {
name = reinterpret_cast<std::string*>(buffer_userp);
} else {
name = new std::string("<unknown>");
}
std::stringstream ss;
ss << buffer;
std::string buffer_str = ss.str();
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
("Releasing buffer " + buffer_str + " of size " +
std::to_string(byte_size) + " in " +
TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + *name)
.c_str());
switch (memory_type) {
case TRITONSERVER_MEMORY_CPU:
free(buffer);
break;
default:
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
std::string(
"error: unexpected buffer allocated in CUDA managed memory")
.c_str());
break;
}
delete name;
return nullptr; // Success
}
void
InferRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
if (request != nullptr) {
LOG_IF_ERROR(
TRITONSERVER_InferenceRequestDelete(request),
"Failed to delete inference request.");
}
}
void
InferResponseComplete(
TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
{
// The following logic only works for non-decoupled models as for decoupled
// models it may send multiple responses for a request or not send any
// responses for a request. Need to modify this function if the model is using
// decoupled API.
if (response != nullptr) {
// Send 'response' to the future.
std::promise<TRITONSERVER_InferenceResponse*>* p =
reinterpret_cast<std::promise<TRITONSERVER_InferenceResponse*>*>(userp);
p->set_value(response);
delete p;
}
}
ModelExecutor::ModelExecutor(TRITONSERVER_Server* server) : server_(server)
{
// When triton needs a buffer to hold an output tensor, it will ask
// us to provide the buffer. In this way we can have any buffer
// management and sharing strategy that we want. To communicate to
// triton the functions that we want it to call to perform the
// allocations, we create a "response allocator" object. We pass
// this response allocate object to triton when requesting
// inference. We can reuse this response allocator object for any
// number of inference requests.
allocator_ = nullptr;
THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(
&allocator_, CPUAllocator, ResponseRelease, nullptr /* start_fn */));
}
TRITONSERVER_Error*
ModelExecutor::AsyncExecute(
TRITONSERVER_InferenceRequest* irequest,
std::future<TRITONSERVER_InferenceResponse*>* future)
{
// Perform inference by calling TRITONSERVER_ServerInferAsync. This
// call is asynchronous and therefore returns immediately. The
// completion of the inference and delivery of the response is done
// by triton by calling the "response complete" callback functions
// (InferResponseComplete in this case).
auto p = new std::promise<TRITONSERVER_InferenceResponse*>();
*future = p->get_future();
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
irequest, allocator_, nullptr /* response_allocator_userp */,
InferResponseComplete, reinterpret_cast<void*>(p)));
RETURN_IF_ERROR(
TRITONSERVER_ServerInferAsync(server_, irequest, nullptr /* trace */));
return nullptr; // success
}
}}} // namespace triton::backend::bls
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <future>
#include <sstream>
#include "triton/backend/backend_common.h"
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend { namespace bls {
#define THROW_IF_TRITON_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw BLSBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \
} \
} while (false)
//
// BLSBackendException
//
// Exception thrown if error occurs in BLSBackend.
//
struct BLSBackendException : std::exception {
BLSBackendException(const std::string& message) : message_(message) {}
const char* what() const throw() { return message_.c_str(); }
std::string message_;
};
// Performs the allocations of output tensors.
TRITONSERVER_Error* CPUAllocator(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id);
// Callback functions for server inference.
TRITONSERVER_Error* ResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
void InferRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
void InferResponseComplete(
TRITONSERVER_InferenceResponse* response, const uint32_t flags,
void* userp);
//
// ModelExecutor
//
// Execute inference request on a model.
//
class ModelExecutor {
public:
ModelExecutor(TRITONSERVER_Server* server);
// Performs async inference request.
TRITONSERVER_Error* AsyncExecute(
TRITONSERVER_InferenceRequest* irequest,
std::future<TRITONSERVER_InferenceResponse*>* future);
private:
// The server object that encapsulates all the functionality of the Triton
// server and allows access to the Triton server API.
TRITONSERVER_Server* server_;
// The allocator object that will be used for allocating output tensors.
TRITONSERVER_ResponseAllocator* allocator_;
};
}}} // namespace triton::backend::bls
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONBACKEND_*;
local: *;
};
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
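# CMake package configuration consumed by find_package(TutorialMinimalBackend).
# It adds this directory to CMAKE_MODULE_PATH, includes the exported targets
# the first time it is loaded, and records the imported target in
# TUTORIALMINIMALBACKEND_LIBRARIES. Typical downstream usage (illustrative
# only; 'my_target' is a placeholder):
#
#   find_package(TutorialMinimalBackend REQUIRED)
#   target_link_libraries(my_target PRIVATE TutorialMinimalBackend::triton-minimal-backend)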
include(CMakeFindDependencyMacro)
get_filename_component(
TUTORIALMINIMALBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TUTORIALMINIMALBACKEND_CMAKE_DIR})
if(NOT TARGET TutorialMinimalBackend::triton-minimal-backend)
include("${TUTORIALMINIMALBACKEND_CMAKE_DIR}/TutorialMinimalBackendTargets.cmake")
endif()
set(TUTORIALMINIMALBACKEND_LIBRARIES TutorialMinimalBackend::triton-minimal-backend)
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONBACKEND_*;
local: *;
};
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
namespace triton { namespace backend { namespace minimal {
//
// Minimal backend that demonstrates the TRITONBACKEND API. This
// backend works for any model that has 1 input called "IN0" with
// INT32 datatype and shape [ 4 ] and 1 output called "OUT0" with
// INT32 datatype and shape [ 4 ]. The backend supports both batching
// and non-batching models.
//
// For each batch of requests, the backend returns the input tensor
// value in the output tensor.
//
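// For reference, a model that uses this backend could be configured with a
// config.pbtxt along these lines (illustrative only; the model name and
// max_batch_size are arbitrary choices):
//
//   name: "minimal_model"
//   backend: "minimal"
//   max_batch_size: 8
//   input [ { name: "IN0", data_type: TYPE_INT32, dims: [ 4 ] } ]
//   output [ { name: "OUT0", data_type: TYPE_INT32, dims: [ 4 ] } ]
//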
/////////////
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model. ModelState is derived from the BackendModel
// class provided in the backend utilities, which provides many common
// functions.
//
class ModelState : public BackendModel {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
private:
ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {}
};
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded
// to allow the backend to create any state associated with the model,
// and to also examine the model configuration to determine if the
// configuration is suitable for the backend. Any errors reported by
// this function will prevent the model from loading.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
// Create a ModelState object and associate it with the
// TRITONBACKEND_Model. If anything goes wrong with initialization
// of the model state then an error is returned and Triton will fail
// to load the model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer
// needed. The backend should cleanup any state associated with the
// model. This function will not be called until all model instances
// of the model have been finalized.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
delete model_state;
return nullptr; // success
}
} // extern "C"
/////////////
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each
// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from the
// BackendModelInstance class provided in the backend utilities, which
// provides many common functions.
//
class ModelInstanceState : public BackendModelInstance {
public:
static TRITONSERVER_Error* Create(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state);
virtual ~ModelInstanceState() = default;
// Get the state of the model that corresponds to this instance.
ModelState* StateForModel() const { return model_state_; }
private:
ModelInstanceState(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance),
model_state_(model_state)
{
}
ModelState* model_state_;
};
TRITONSERVER_Error*
ModelInstanceState::Create(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model
// instance is created to allow the backend to initialize any state
// associated with the instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
// Get the model state associated with this instance's model.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// Create a ModelInstanceState object and associate it with the
// TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model
// instance is no longer needed. The backend should cleanup any state
// associated with the model instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
delete instance_state;
return nullptr; // success
}
} // extern "C"
/////////////
extern "C" {
// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required
// that a backend create a response for each request in the batch. A
// response may be the output tensors required for that request or may
// be an error that is returned in the response.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Best practice for a high-performance
// implementation is to avoid introducing mutex/lock and instead use
// only function-local and model-instance-specific state.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();
// 'responses' is initialized as a parallel array to 'requests',
// with one TRITONBACKEND_Response object for each
// TRITONBACKEND_Request object. If something goes wrong while
// creating these response objects, the backend simply returns an
// error from TRITONBACKEND_ModelInstanceExecute, indicating to
// Triton that this backend did not create or send any responses and
// so it is up to Triton to create and send an appropriate error
// response for each request. RETURN_IF_ERROR is one of several
// useful macros for error handling that can be found in
// backend_common.h.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
responses.push_back(response);
}
// At this point, the backend takes ownership of 'requests', which
// means that it is responsible for sending a response for every
// request. From here, even if something goes wrong in processing,
// the backend must return 'nullptr' from this function to indicate
// success. Any errors and failures must be communicated via the
// response objects.
//
// To simplify error handling, the backend utilities manage
// 'responses' in a specific way and it is recommended that backends
// follow this same pattern. When an error is detected in the
// processing of a request, an appropriate error response is sent
// and the corresponding TRITONBACKEND_Response object within
// 'responses' is set to nullptr to indicate that the
// request/response has already been handled and no further processing
// should be performed for that request. Even if all responses fail,
// the backend still allows execution to flow to the end of the
// function. RESPOND_AND_SET_NULL_IF_ERROR, and
// RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from
// backend_common.h that assist in this management of response
// objects.
// The backend could iterate over the 'requests' and process each
// one separately. But for performance reasons it is usually
// preferred to create batched input tensors that are processed
// simultaneously. This is especially true for devices like GPUs
// that are capable of exploiting the large amount of parallelism
// exposed by larger data sets.
//
// The backend utilities provide a "collector" to facilitate this
// batching process. The 'collector's ProcessTensor function will
// combine a tensor's value from each request in the batch into a
// single contiguous buffer. The buffer can be provided by the
// backend or 'collector' can create and manage it. In this backend,
// there is not a specific buffer into which the batch should be
// created, so use ProcessTensor arguments that cause 'collector' to
// manage it.
BackendInputCollector collector(
requests, request_count, &responses, model_state->TritonMemoryManager(),
false /* pinned_enabled */, nullptr /* stream*/);
// To instruct ProcessTensor to "gather" the entire batch of IN0
// input tensors into a single contiguous buffer in CPU memory, set
// the "allowed input types" to be the CPU ones (see tritonserver.h
// in the triton-inference-server/core repo for allowed memory
// types).
std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =
{{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};
const char* input_buffer;
size_t input_buffer_byte_size;
TRITONSERVER_MemoryType input_buffer_memory_type;
int64_t input_buffer_memory_type_id;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
collector.ProcessTensor(
"IN0", nullptr /* existing_buffer */,
0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer,
&input_buffer_byte_size, &input_buffer_memory_type,
&input_buffer_memory_type_id));
// Finalize the collector. If 'true' is returned, 'input_buffer'
// will not be valid until the backend synchronizes the CUDA
// stream or event that was used when creating the collector. For
// this backend, GPU is not supported and so no CUDA sync should
// be needed; so if 'true' is returned simply log an error.
const bool need_cuda_input_sync = collector.Finalize();
if (need_cuda_input_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'minimal' backend: unexpected CUDA sync required by collector");
}
// 'input_buffer' contains the batched "IN0" tensor. The backend can
// implement whatever logic is necessary to produce "OUT0". This
// backend simply returns the IN0 value in OUT0 so no actual
// computation is needed.
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model ") + model_state->Name() + ": requests in batch " +
std::to_string(request_count))
.c_str());
std::string tstr;
IGNORE_ERROR(BufferAsTypedString(
tstr, input_buffer, input_buffer_byte_size, TRITONSERVER_TYPE_INT32));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("batched IN0 value: ") + tstr).c_str());
const char* output_buffer = input_buffer;
TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;
int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;
// This backend supports models that batch along the first dimension
// and those that don't batch. For non-batch models the output shape
// will be [ 4 ]. For batch models the output shape will be [ -1, 4
// ] and the backend "responder" utility below will set the
// appropriate batch dimension value for each response.
std::vector<int64_t> output_batch_shape;
bool supports_first_dim_batching;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
model_state->SupportsFirstDimBatching(&supports_first_dim_batching));
if (supports_first_dim_batching) {
output_batch_shape.push_back(-1);
}
output_batch_shape.push_back(4);
// Because the OUT0 values are concatenated into a single contiguous
// 'output_buffer', the backend must "scatter" them out to the
// individual response OUT0 tensors. The backend utilities provide
// a "responder" to facilitate this scattering process.
// The 'responder's ProcessTensor function will copy the portion of
// 'output_buffer' corresponding to each request's output into the
// response for that request.
BackendOutputResponder responder(
requests, request_count, &responses, model_state->TritonMemoryManager(),
supports_first_dim_batching, false /* pinned_enabled */,
nullptr /* stream*/);
responder.ProcessTensor(
"OUT0", TRITONSERVER_TYPE_INT32, output_batch_shape, output_buffer,
output_buffer_memory_type, output_buffer_memory_type_id);
// Finalize the responder. If 'true' is returned, the OUT0
// tensors' data will not be valid until the backend synchronizes
// the CUDA stream or event that was used when creating the
// responder. For this backend, GPU is not supported and so no
// CUDA sync should be needed; so if 'true' is returned simply log
// an error.
const bool need_cuda_output_sync = responder.Finalize();
if (need_cuda_output_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'minimal' backend: unexpected CUDA sync required by responder");
}
// Send all the responses that haven't already been sent because of
// an earlier error.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send response");
}
}
// Done with the request objects so release them.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::minimal
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR})
if(NOT TARGET TutorialRecommendedBackend::triton-recommended-backend)
include("${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR}/TutorialRecommendedBackendTargets.cmake")
endif()
set(TUTORIALRECOMMENDEDBACKEND_LIBRARIES TutorialRecommendedBackend::triton-recommended-backend)