Commit 0a21fff9 in OpenDAS / Lmdeploy
Authored Dec 20, 2023 by xiabo

    Adapt to 0.1.0

Parent: 9484fd1c
Changes: 158
Showing 20 changed files with 8704 additions and 0 deletions (+8704, -0)
3rdparty/core-r22.12/cmake/TritonCoreConfig.cmake.in  +37  -0
3rdparty/core-r22.12/include/triton/core/tritonbackend.h  +1410  -0
3rdparty/core-r22.12/include/triton/core/tritonrepoagent.h  +417  -0
3rdparty/core-r22.12/include/triton/core/tritonserver.h  +2360  -0
3rdparty/core-r22.12/src/backend_config.cc  +225  -0
3rdparty/core-r22.12/src/backend_config.h  +77  -0
3rdparty/core-r22.12/src/backend_manager.cc  +383  -0
3rdparty/core-r22.12/src/backend_manager.h  +174  -0
3rdparty/core-r22.12/src/backend_memory_manager.cc  +149  -0
3rdparty/core-r22.12/src/backend_memory_manager.h  +36  -0
3rdparty/core-r22.12/src/backend_model.cc  +1301  -0
3rdparty/core-r22.12/src/backend_model.h  +133  -0
3rdparty/core-r22.12/src/backend_model_instance.cc  +966  -0
3rdparty/core-r22.12/src/backend_model_instance.h  +200  -0
3rdparty/core-r22.12/src/buffer_attributes.cc  +104  -0
3rdparty/core-r22.12/src/buffer_attributes.h  +79  -0
3rdparty/core-r22.12/src/constants.h  +108  -0
3rdparty/core-r22.12/src/cuda_memory_manager.cc  +197  -0
3rdparty/core-r22.12/src/cuda_memory_manager.h  +85  -0
3rdparty/core-r22.12/src/cuda_utils.cc  +263  -0
Too many changes to show. To preserve performance only 158 of 158+ files are displayed.
3rdparty/core-r22.12/cmake/TritonCoreConfig.cmake.in  (new file, 0 → 100644)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)

get_filename_component(
  TRITONCORE_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)

list(APPEND CMAKE_MODULE_PATH ${TRITONCORE_CMAKE_DIR})

if(NOT TARGET TritonCore::triton-core-serverapi)
  include("${TRITONCORE_CMAKE_DIR}/TritonCoreTargets.cmake")
endif()
3rdparty/core-r22.12/include/triton/core/tritonbackend.h  (new file, 0 → 100644)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONBACKEND
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllexport)
#define TRITONBACKEND_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONBACKEND_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONBACKEND_ISPEC
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllimport)
#define TRITONBACKEND_ISPEC __declspec(dllexport)
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#endif
struct TRITONBACKEND_MemoryManager;
struct TRITONBACKEND_Input;
struct TRITONBACKEND_Output;
struct TRITONBACKEND_State;
struct TRITONBACKEND_Request;
struct TRITONBACKEND_ResponseFactory;
struct TRITONBACKEND_Response;
struct TRITONBACKEND_Backend;
struct TRITONBACKEND_Model;
struct TRITONBACKEND_ModelInstance;
struct TRITONBACKEND_BackendAttribute;

///
/// TRITONBACKEND API Version
///
/// The TRITONBACKEND API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// backend should check that the API version used to compile the
/// backend is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the backend.
///
///   uint32_t api_version_major, api_version_minor;
///   TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor);
///   if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
///       (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
///     return TRITONSERVER_ErrorNew(
///         TRITONSERVER_ERROR_UNSUPPORTED,
///         "triton backend API version does not support this backend");
///   }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 10
/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
/// TRITONBACKEND_API_VERSION_MINOR used to build the backend to
/// ensure that Triton is compatible with the backend.
///
/// \param major Returns the TRITONBACKEND API major version supported
/// by Triton.
/// \param minor Returns the TRITONBACKEND API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ApiVersion(
    uint32_t* major, uint32_t* minor);

/// TRITONBACKEND_ArtifactType
///
/// The ways that the files that make up a backend or model are
/// communicated to the backend.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model or backend
/// artifacts are made available to Triton via a locally
/// accessible filesystem. The backend can access these files
/// using an appropriate system API.
///
typedef enum TRITONBACKEND_artifacttype_enum {
  TRITONBACKEND_ARTIFACT_FILESYSTEM
} TRITONBACKEND_ArtifactType;

///
/// TRITONBACKEND_MemoryManager
///
/// Object representing a memory manager that is capable of
/// allocating and otherwise managing different memory types. For
/// improved performance Triton maintains pools for GPU and CPU-pinned
/// memory and the memory manager allows backends to access those
/// pools.
///
/// Allocate a contiguous block of memory of a specific type using a
/// memory manager. Two error codes have specific interpretations for
/// this function:
///
/// TRITONSERVER_ERROR_UNSUPPORTED: Indicates that Triton is
/// incapable of allocating the requested memory type and memory
/// type ID. Requests for the memory type and ID will always fail
/// no matter 'byte_size' of the request.
///
/// TRITONSERVER_ERROR_UNAVAILABLE: Indicates that Triton can
/// allocate the memory type and ID but that currently it cannot
/// allocate a contiguous block of memory of the requested
/// 'byte_size'.
///
/// \param manager The memory manager.
/// \param buffer Returns the allocated memory.
/// \param memory_type The type of memory to allocate.
/// \param memory_type_id The ID associated with the memory type to
/// allocate. For GPU memory this indicates the device ID of the GPU
/// to allocate from.
/// \param byte_size The size of memory to allocate, in bytes.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerAllocate(
    TRITONBACKEND_MemoryManager* manager, void** buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
    const uint64_t byte_size);

/// Free a buffer that was previously allocated with
/// TRITONBACKEND_MemoryManagerAllocate. The call must provide the
/// same values for 'memory_type' and 'memory_type_id' as were used
/// when the buffer was allocated or else the behavior is undefined.
///
/// \param manager The memory manager.
/// \param buffer The allocated memory buffer to free.
/// \param memory_type The type of memory of the buffer.
/// \param memory_type_id The ID associated with the memory type of
/// the buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
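///
/// A minimal allocate/free sketch (illustrative only, not part of the API;
/// 'manager' is assumed to come from TRITONBACKEND_BackendMemoryManager and
/// error handling is abbreviated):
///
///   void* buffer = nullptr;
///   TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(
///       manager, &buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */,
///       1024 /* byte_size */);
///   if (err == nullptr) {
///     // ... use the pinned buffer ...
///     TRITONBACKEND_MemoryManagerFree(
///         manager, buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */);
///   }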
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerFree(
    TRITONBACKEND_MemoryManager* manager, void* buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);

///
/// TRITONBACKEND_Input
///
/// Object representing an input tensor.
///
/// Get the name and properties of an input tensor. The returned
/// strings and other properties are owned by the input, not the
/// caller, and so should not be modified or freed.
///
/// \param input The input tensor.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBuffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputProperties(
    TRITONBACKEND_Input* input, const char** name,
    TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);

/// Get the name and properties of an input tensor associated with a given
/// host policy. If there are no input buffers for the specified host policy,
/// the properties of the fallback input buffers are returned. The returned
/// strings and other properties are owned by the input, not the caller, and so
/// should not be modified or freed.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input properties
/// will be returned if nullptr is provided.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBufferForHostPolicy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);

/// Get a buffer holding (part of) the tensor data for an input. For a
/// given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputProperties. The
/// returned buffer is owned by the input and so should not be
/// modified or freed by the caller. The lifetime of the buffer
/// matches that of the input and so the buffer should not be accessed
/// after the input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
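///
/// A typical loop over all buffers of an input looks like the following
/// (illustrative sketch only; 'input' is assumed to come from
/// TRITONBACKEND_RequestInput and error handling is omitted):
///
///   uint32_t buffer_count = 0;
///   TRITONBACKEND_InputProperties(
///       input, nullptr, nullptr, nullptr, nullptr, nullptr, &buffer_count);
///   for (uint32_t b = 0; b < buffer_count; ++b) {
///     const void* buffer = nullptr;
///     uint64_t buffer_byte_size = 0;
///     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///     int64_t memory_type_id = 0;
///     TRITONBACKEND_InputBuffer(
///         input, b, &buffer, &buffer_byte_size, &memory_type,
///         &memory_type_id);
///     // ... consume 'buffer' ...
///   }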
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBuffer(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id);

/// Get a buffer holding (part of) the tensor data for an input for a specific
/// host policy. If there are no input buffers specified for this host policy,
/// the fallback input buffer is returned.
/// For a given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputPropertiesForHostPolicy.
/// The returned buffer is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the buffer matches that of the input
/// and so the buffer should not be accessed after the input tensor object is
/// released.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input buffer
/// will be returned if nullptr is provided.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputPropertiesForHostPolicy.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Get the buffer attributes associated with the given input buffer. For a
/// given input the number of buffers composing the input are found from
/// 'buffer_count' returned by TRITONBACKEND_InputProperties. The returned
/// 'buffer_attributes' is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the 'buffer_attributes' matches that of
/// the input and so the 'buffer_attributes' should not be accessed after the
/// input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index < buffer_count,
/// where buffer_count is the value returned by TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_attributes Returns the attributes for the given buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Output
///
/// Object representing a response output tensor.
///
/// Get a buffer to use to hold the tensor data for the output. The
/// returned buffer is owned by the output and so should not be freed
/// by the caller. The caller can and should fill the buffer with the
/// output data for the tensor. The lifetime of the buffer matches
/// that of the output and so the buffer should not be accessed after
/// the output tensor object is released.
///
/// \param buffer Returns a pointer to a buffer where the contents of
/// the output tensor should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_OutputBuffer(
    TRITONBACKEND_Output* output, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id);

/// Get the buffer attributes associated with the given output buffer. The
/// returned 'buffer_attributes' is owned by the output and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the output and so the 'buffer_attributes' should not be
/// accessed after the output tensor object is released. This function must be
/// called after the TRITONBACKEND_OutputBuffer otherwise it might contain
/// incorrect data.
///
/// \param output The output tensor.
/// \param buffer_attributes Returns the attributes for the output buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
    TRITONBACKEND_Output* output,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Request
///
/// Object representing an inference request.
///
/// Get the ID of the request. Can be nullptr if the request doesn't have
/// an ID. The returned string is owned by the request, not the
/// caller, and so should not be modified or freed.
///
/// \param request The inference request.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestId(
    TRITONBACKEND_Request* request, const char** id);

/// Get the correlation ID of the request if it is an unsigned integer.
/// Zero indicates that the request does not have a correlation ID.
/// Returns failure if correlation ID for given request is not an unsigned
/// integer.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestCorrelationId(
    TRITONBACKEND_Request* request, uint64_t* id);

/// Get the correlation ID of the request if it is a string.
/// Empty string indicates that the request does not have a correlation ID.
/// Returns error if correlation ID for given request is not a string.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
    TRITONBACKEND_Request* request, const char** id);

/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param request The inference request.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestFlags(
    TRITONBACKEND_Request* request, uint32_t* flags);

/// Get the number of input tensors specified in the request.
///
/// \param request The inference request.
/// \param count Returns the number of input tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputCount(
    TRITONBACKEND_Request* request, uint32_t* count);

/// Get the name of an input tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input_name Returns the name of the input tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** input_name);

/// Get a named request input. The lifetime of the returned input
/// object matches that of the request and so the input object should
/// not be accessed after the request object is released.
///
/// \param request The inference request.
/// \param name The name of the input.
/// \param input Returns the input corresponding to the name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInput(
    TRITONBACKEND_Request* request, const char* name,
    TRITONBACKEND_Input** input);

/// Get a request input by index. The order of inputs in a given
/// request is not necessarily consistent with other requests, even if
/// the requests are in the same batch. As a result, you can not
/// assume that an index obtained from one request will point to the
/// same input in a different request.
///
/// The lifetime of the returned input object matches that of the
/// request and so the input object should not be accessed after the
/// request object is released.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input Returns the input corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
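///
/// For example, a backend can visit every input of a request with the
/// count/by-index pair (illustrative sketch only; error handling omitted):
///
///   uint32_t input_count = 0;
///   TRITONBACKEND_RequestInputCount(request, &input_count);
///   for (uint32_t i = 0; i < input_count; ++i) {
///     TRITONBACKEND_Input* input = nullptr;
///     TRITONBACKEND_RequestInputByIndex(request, i, &input);
///     // ... query the input with TRITONBACKEND_InputProperties ...
///   }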
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputByIndex(
    TRITONBACKEND_Request* request, const uint32_t index,
    TRITONBACKEND_Input** input);

/// Get the number of output tensors requested to be returned in the
/// request.
///
/// \param request The inference request.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputCount(
    TRITONBACKEND_Request* request, uint32_t* count);

/// Get the name of a requested output tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the requested output tensor. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_RequestOutputCount.
/// \param output_name Returns the name of the requested output tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** output_name);

/// Returns the preferred memory type and memory type ID of the output buffer
/// for the request. As much as possible, Triton will attempt to return
/// the same memory_type and memory_type_id values that will be returned by
/// the subsequent call to TRITONBACKEND_OutputBuffer, however, the backend must
/// be capable of handling cases where the values differ.
///
/// \param request The request.
/// \param name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns memory type preferred
/// by Triton, taking into account the caller's preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns memory type ID preferred
/// by Triton, taking into account the caller's preferred type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
/// A TRITONSERVER_ERROR_UNAVAILABLE error indicates that the properties are not
/// available, other error codes indicate an error.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Release the request. The request should be released when it is no
/// longer needed by the backend. If this call returns with an error
/// (i.e. non-nullptr) then the request was not released and ownership
/// remains with the backend. If this call returns with success, the
/// 'request' object is no longer owned by the backend and must not be
/// used. Any tensor names, data types, shapes, input tensors,
/// etc. returned by TRITONBACKEND_Request* functions for this request
/// are no longer valid. If a persistent copy of that data is required
/// it must be created before calling this function.
///
/// \param request The inference request.
/// \param release_flags Flags indicating what type of request release
/// should be performed. \see TRITONSERVER_RequestReleaseFlag. \see
/// TRITONSERVER_InferenceRequestReleaseFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestRelease(
    TRITONBACKEND_Request* request, uint32_t release_flags);

///
/// TRITONBACKEND_ResponseFactory
///
/// Object representing an inference response factory. Using a
/// response factory is not required; instead a response can be
/// generated directly from a TRITONBACKEND_Request object using
/// TRITONBACKEND_ResponseNew(). A response factory allows a request
/// to be released before all responses have been sent. Releasing a
/// request as early as possible releases all input tensor data and
/// therefore may be desirable in some cases.
/// Create the response factory associated with a request.
///
/// \param factory Returns the new response factory.
/// \param request The inference request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryNew(
    TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request);

/// Destroy a response factory.
///
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryDelete(
    TRITONBACKEND_ResponseFactory* factory);

/// Send response flags without a corresponding response.
///
/// \param factory The response factory.
/// \param send_flags Flags to send. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
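///
/// A decoupled-style sketch (illustrative only; the flag constants are
/// defined in tritonserver.h and error handling is omitted): create the
/// factory, release the request early, then send responses from the factory
/// and finish with a FINAL flag.
///
///   TRITONBACKEND_ResponseFactory* factory = nullptr;
///   TRITONBACKEND_ResponseFactoryNew(&factory, request);
///   TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL);
///   // ... later, possibly from another thread ...
///   TRITONBACKEND_Response* response = nullptr;
///   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
///   // ... add outputs, then send ...
///   TRITONBACKEND_ResponseSend(response, 0 /* send_flags */, nullptr);
///   TRITONBACKEND_ResponseFactorySendFlags(
///       factory, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
///   TRITONBACKEND_ResponseFactoryDelete(factory);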
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
    TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags);

///
/// TRITONBACKEND_Response
///
/// Object representing an inference response. For a given request,
/// the backend must carefully manage the lifecycle of responses
/// generated for that request to ensure that the output tensor
/// buffers are allocated correctly. When a response is created with
/// TRITONBACKEND_ResponseNew or TRITONBACKEND_ResponseNewFromFactory,
/// all the outputs and corresponding buffers must be created for that
/// response using TRITONBACKEND_ResponseOutput and
/// TRITONBACKEND_OutputBuffer *before* another response is created
/// for the request. For a given response, outputs can be created in
/// any order but they must be created sequentially/synchronously (for
/// example, the backend cannot use multiple threads to simultaneously
/// add multiple outputs to a response).
///
/// The above requirement applies only to responses being generated
/// for a given request. The backend may generate responses in
/// parallel on multiple threads as long as those responses are for
/// different requests.
///
/// This order of response creation must be strictly followed. But,
/// once response(s) are created they do not need to be sent
/// immediately, nor do they need to be sent in the order they were
/// created. The backend may even delete a created response instead of
/// sending it by using TRITONBACKEND_ResponseDelete.
/// Create a response for a request.
///
/// \param response Returns the new response.
/// \param request The request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseNew(
    TRITONBACKEND_Response** response, TRITONBACKEND_Request* request);

/// Create a response using a factory.
///
/// \param response Returns the new response.
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
    TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory);

/// Destroy a response. It is not necessary to delete a response if
/// TRITONBACKEND_ResponseSend is called as that function transfers
/// ownership of the response object to Triton.
///
/// \param response The response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseDelete(
    TRITONBACKEND_Response* response);

/// Set a string parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
    TRITONBACKEND_Response* response, const char* name, const char* value);

/// Set an integer parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
    TRITONBACKEND_Response* response, const char* name, const int64_t value);

/// Set a boolean parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
    TRITONBACKEND_Response* response, const char* name, const bool value);

/// Create an output tensor in the response. The lifetime of the
/// returned output tensor object matches that of the response and so
/// the output tensor object should not be accessed after the response
/// object is deleted.
///
/// \param response The response.
/// \param output Returns the new response output.
/// \param name The name of the output tensor.
/// \param datatype The datatype of the output tensor.
/// \param shape The shape of the output tensor.
/// \param dims_count The number of dimensions in the output tensor
/// shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
    TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count);

/// Send a response. Calling this function transfers ownership of the
/// response object to Triton. The caller must not access or delete
/// the response object after calling this function.
///
/// \param response The response.
/// \param send_flags Flags associated with the response. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \param error The TRITONSERVER_Error to send if the response is an
/// error, or nullptr if the response is successful.
/// \return a TRITONSERVER_Error indicating success or failure.
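///
/// A typical create/fill/send sequence looks like the following (illustrative
/// sketch only; the "OUTPUT0" name, shape, datatype and size are placeholders
/// and error handling is omitted):
///
///   TRITONBACKEND_Response* response = nullptr;
///   TRITONBACKEND_ResponseNew(&response, request);
///   TRITONBACKEND_Output* output = nullptr;
///   const int64_t shape[1] = {8};
///   TRITONBACKEND_ResponseOutput(
///       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape,
///       1 /* dims_count */);
///   void* buffer = nullptr;
///   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///   int64_t memory_type_id = 0;
///   TRITONBACKEND_OutputBuffer(
///       output, &buffer, 8 * sizeof(float), &memory_type, &memory_type_id);
///   // ... fill 'buffer' with the output data ...
///   TRITONBACKEND_ResponseSend(
///       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* error */);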
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
    TRITONBACKEND_Response* response, const uint32_t send_flags,
    TRITONSERVER_Error* error);

///
/// TRITONBACKEND_State
///
/// Object representing a state.
///
/// Create a state in the request. The returned state object is only valid
/// before the TRITONBACKEND_StateUpdate is called. The state should not be
/// freed by the caller. If TRITONBACKEND_StateUpdate is not called, the
/// lifetime of the state matches the lifetime of the request. If the state name
/// does not exist in the "state" section of the model configuration, the state
/// will not be created and an error will be returned. If this function is
/// called when sequence batching is not enabled or there is no 'states' section
/// in the sequence batching section of the model configuration, this call will
/// return an error.
///
/// \param state Returns the new state.
/// \param request The request.
/// \param name The name of the state.
/// \param datatype The datatype of the state.
/// \param shape The shape of the state.
/// \param dims_count The number of dimensions in the state shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateNew(
    TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count);

/// Update the state for the sequence. Calling this function will replace the
/// state stored for this sequence in Triton with 'state' provided in the
/// function argument. If this function is called when sequence batching is not
/// enabled or there is no 'states' section in the sequence batching section of
/// the model configuration, this call will return an error. The backend is not
/// required to call this function. If the backend doesn't call
/// TRITONBACKEND_StateUpdate function, this particular state for the sequence
/// will not be updated and the next inference request in the sequence will use
/// the same state as the current inference request.
///
/// \param state The state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateUpdate(
    TRITONBACKEND_State* state);

/// Get a buffer to use to hold the tensor data for the state. The returned
/// buffer is owned by the state and so should not be freed by the caller. The
/// caller can and should fill the buffer with the state data. The buffer must
/// not be accessed by the backend after TRITONBACKEND_StateUpdate is called.
/// The caller should fill the buffer before calling TRITONBACKEND_StateUpdate.
///
/// \param state The state.
/// \param buffer Returns a pointer to a buffer where the contents of the state
/// should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
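///
/// A minimal sequence-state sketch (illustrative only; it assumes a state
/// named "sequence_state" is declared in the model's sequence_batching state
/// configuration, and error handling is omitted):
///
///   TRITONBACKEND_State* state = nullptr;
///   const int64_t shape[1] = {1};
///   TRITONBACKEND_StateNew(
///       &state, request, "sequence_state", TRITONSERVER_TYPE_INT32, shape,
///       1 /* dims_count */);
///   void* buffer = nullptr;
///   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///   int64_t memory_type_id = 0;
///   TRITONBACKEND_StateBuffer(
///       state, &buffer, sizeof(int32_t), &memory_type, &memory_type_id);
///   // ... write the new state value into 'buffer' ...
///   TRITONBACKEND_StateUpdate(state);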
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBuffer(
    TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Get the buffer attributes associated with the given state buffer.
/// The returned 'buffer_attributes' is owned by the state and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the state.
///
/// \param state The state.
/// \param buffer_attributes Returns the buffer attributes for the given state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBufferAttributes(
    TRITONBACKEND_State* state,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Backend
///
/// Object representing a backend.
///
/// TRITONBACKEND_ExecutionPolicy
///
/// Types of execution policy that can be implemented by a backend.
///
/// TRITONBACKEND_EXECUTION_BLOCKING: An instance of the model
/// blocks in TRITONBACKEND_ModelInstanceExecute until it is ready
/// to handle another inference. Upon returning from
/// TRITONBACKEND_ModelInstanceExecute, Triton may immediately
/// call TRITONBACKEND_ModelInstanceExecute for the same instance
/// to execute a new batch of requests. Thus, most backends using
/// this policy will not return from
/// TRITONBACKEND_ModelInstanceExecute until all responses have
/// been sent and all requests have been released. This is the
/// default execution policy.
///
/// TRITONBACKEND_EXECUTION_DEVICE_BLOCKING: An instance, A, of the
/// model blocks in TRITONBACKEND_ModelInstanceExecute if the
/// device associated with the instance is unable to handle
/// another inference. Even if another instance, B, associated
/// with the device, is available and ready to perform an
/// inference, Triton will not invoke
/// TRITONBACKEND_ModelInstanceExecute for B until A returns from
/// TRITONBACKEND_ModelInstanceExecute. Triton will not be blocked
/// from calling TRITONBACKEND_ModelInstanceExecute for instance
/// C, which is associated with a different device than A and B,
/// even if A or B has not returned from
/// TRITONBACKEND_ModelInstanceExecute. This execution policy is
/// typically used by a backend that can cooperatively execute
/// multiple model instances on the same device.
///
typedef enum TRITONBACKEND_execpolicy_enum {
  TRITONBACKEND_EXECUTION_BLOCKING,
  TRITONBACKEND_EXECUTION_DEVICE_BLOCKING
} TRITONBACKEND_ExecutionPolicy;

/// Get the name of the backend. The caller does not own the returned
/// string and must not modify or delete it. The lifetime of the
/// returned string extends only as long as 'backend'.
///
/// \param backend The backend.
/// \param name Returns the name of the backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendName(
    TRITONBACKEND_Backend* backend, const char** name);

/// Get the backend configuration. The 'backend_config' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The backend configuration, as JSON, is:
///
///   {
///     "cmdline" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param backend The backend.
/// \param backend_config Returns the backend configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendConfig(
    TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config);

/// Get the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING.
///
/// \param backend The backend.
/// \param policy Returns the execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy);

/// Set the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING. Triton reads
/// the backend's execution policy after calling
/// TRITONBACKEND_Initialize, so to be recognized changes to the
/// execution policy must be made in TRITONBACKEND_Initialize.
/// Also, note that if the sequence batcher is used for the model, Triton
/// will use the TRITONBACKEND_EXECUTION_BLOCKING policy irrespective of
/// the policy specified by this setter function.
///
/// \param backend The backend.
/// \param policy The execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
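///
/// For example, a backend that wants device-blocking execution would set the
/// policy during initialization (illustrative sketch only;
/// TRITONBACKEND_Initialize is the backend entry point declared later in this
/// header):
///
///   TRITONSERVER_Error*
///   TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
///   {
///     // ... version check, backend config, etc. ...
///     return TRITONBACKEND_BackendSetExecutionPolicy(
///         backend, TRITONBACKEND_EXECUTION_DEVICE_BLOCKING);
///   }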
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy);

/// Get the location of the files that make up the backend
/// implementation. This location contains the backend shared library
/// and any other files located with the shared library. The
/// 'location' communicated depends on how the backend is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The backend artifacts are
/// made available to Triton via the local filesystem. 'location'
/// returns the full path to the directory containing this
/// backend's artifacts. The returned string is owned by Triton,
/// not the caller, and so should not be modified or freed.
///
/// \param backend The backend.
/// \param artifact_type Returns the artifact type for the backend.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendArtifacts(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location);

/// Get the memory manager associated with a backend.
///
/// \param backend The backend.
/// \param manager Returns the memory manager.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendMemoryManager(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager);

/// Get the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendState(
    TRITONBACKEND_Backend* backend, void** state);

/// Set the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendSetState(
    TRITONBACKEND_Backend* backend, void* state);

///
/// TRITONBACKEND_Model
///
/// Object representing a model implemented using the backend.
///
/// Get the name of the model. The returned string is owned by the
/// model object, not the caller, and so should not be modified or
/// freed.
///
/// \param model The model.
/// \param name Returns the model name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelName(
    TRITONBACKEND_Model* model, const char** name);

/// Get the version of the model.
///
/// \param model The model.
/// \param version Returns the model version.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelVersion(
    TRITONBACKEND_Model* model, uint64_t* version);

/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model artifacts are made
/// available to Triton via the local filesystem. 'location'
/// returns the full path to the directory in the model repository
/// that contains this model's artifacts. The returned string is
/// owned by Triton, not the caller, and so should not be modified
/// or freed.
///
/// \param model The model.
/// \param artifact_type Returns the artifact type for the model.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelRepository(
    TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location);

/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. The configuration is available via this call even
/// before the model is loaded and so can be used in
/// TRITONBACKEND_ModelInitialize. TRITONSERVER_ServerModelConfig
/// returns equivalent information but is not useable until after the
/// model loads.
///
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
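///
/// For example, the configuration can be inspected as JSON (illustrative
/// sketch only; TRITONSERVER_MessageSerializeToJson and
/// TRITONSERVER_MessageDelete are declared in tritonserver.h, and error
/// handling is omitted):
///
///   TRITONSERVER_Message* config = nullptr;
///   TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config);
///   const char* json_base = nullptr;
///   size_t json_size = 0;
///   TRITONSERVER_MessageSerializeToJson(config, &json_base, &json_size);
///   // ... parse the JSON in [json_base, json_base + json_size) ...
///   TRITONSERVER_MessageDelete(config);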
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message** model_config);

/// Whether the backend should attempt to auto-complete the model configuration.
/// If true, the model should fill the inputs, outputs, and max batch size in
/// the model configuration if incomplete. If the model configuration is
/// changed, the new configuration must be reported to Triton using
/// TRITONBACKEND_ModelSetConfig.
///
/// \param model The model.
/// \param auto_complete_config Returns whether the backend should auto-complete
/// the model configuration.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
    TRITONBACKEND_Model* model, bool* auto_complete_config);

/// Set the model configuration in Triton server. This API should only be called
/// when the backend implements the auto-completion of model configuration
/// and TRITONBACKEND_ModelAutoCompleteConfig returns true in
/// auto_complete_config. Only the inputs, outputs, max batch size, and
/// scheduling choice can be changed. The caveat is that the scheduling choice
/// can only be changed if none was previously set. Any other changes to the model
/// configuration will be ignored by Triton. This function can only be called
/// from TRITONBACKEND_ModelInitialize, calling in any other context will result
/// in an error being returned. Additionally, Triton server can add some of the
/// missing fields in the provided config with this call. The backend must get
/// the complete configuration again by using TRITONBACKEND_ModelConfig.
/// TRITONBACKEND_ModelSetConfig does not take ownership of the message object
/// and so the caller should call TRITONSERVER_MessageDelete to release the
/// object once the function returns.
///
/// \param model The model.
/// \param config_version The format version of the model configuration.
/// If the configuration is not represented in the version's format
/// then an error will be returned. Currently only version 1 is supported.
/// \param model_config The updated model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
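///
/// A typical auto-complete flow in TRITONBACKEND_ModelInitialize looks like
/// the following (illustrative sketch only; 'BuildCompletedConfig' is a
/// hypothetical backend helper that produces the updated configuration
/// message):
///
///   bool auto_complete = false;
///   TRITONBACKEND_ModelAutoCompleteConfig(model, &auto_complete);
///   if (auto_complete) {
///     // hypothetical helper, not part of the Triton API
///     TRITONSERVER_Message* updated_config = BuildCompletedConfig(model);
///     TRITONBACKEND_ModelSetConfig(
///         model, 1 /* config_version */, updated_config);
///     TRITONSERVER_MessageDelete(updated_config);
///   }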
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message* model_config);

/// Get the TRITONSERVER_Server object that this model is being served
/// by.
///
/// \param model The model.
/// \param server Returns the server.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelServer(
    TRITONBACKEND_Model* model, TRITONSERVER_Server** server);

/// Get the backend used by the model.
///
/// \param model The model.
/// \param backend Returns the backend object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelBackend(
    TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend);

/// Get the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelState(
    TRITONBACKEND_Model* model, void** state);

/// Set the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
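///
/// For example, a backend usually attaches its per-model object in
/// TRITONBACKEND_ModelInitialize and retrieves it again in
/// TRITONBACKEND_ModelFinalize (illustrative sketch only; 'ModelState' is a
/// hypothetical backend-defined type):
///
///   // in TRITONBACKEND_ModelInitialize
///   ModelState* model_state = new ModelState(model);  // hypothetical type
///   TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state));
///
///   // later, in TRITONBACKEND_ModelFinalize
///   void* vstate = nullptr;
///   TRITONBACKEND_ModelState(model, &vstate);
///   delete reinterpret_cast<ModelState*>(vstate);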
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetState(
    TRITONBACKEND_Model* model, void* state);

///
/// TRITONBACKEND_ModelInstance
///
/// Object representing a model instance implemented using the
/// backend.
///
/// Get the name of the model instance. The returned string is owned by the
/// model object, not the caller, and so should not be modified or
/// freed.
///
/// \param instance The model instance.
/// \param name Returns the instance name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceName(
    TRITONBACKEND_ModelInstance* instance, const char** name);

);
/// Get the kind of the model instance.
///
/// \param instance The model instance.
/// \param kind Returns the instance kind.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceKind(
    TRITONBACKEND_ModelInstance* instance,
    TRITONSERVER_InstanceGroupKind* kind);

/// Get the device ID of the model instance.
///
/// \param instance The model instance.
/// \param device_id Returns the instance device ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceDeviceId(
    TRITONBACKEND_ModelInstance* instance, int32_t* device_id);

/// Get the host policy setting. The 'host_policy' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The host policy setting, as JSON, is:
///
///   {
///     "<host_policy>" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param instance The model instance.
/// \param host_policy Returns the host policy setting as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy);

/// Whether the model instance is passive.
///
/// \param instance The model instance.
/// \param is_passive Returns true if the instance is passive, false otherwise
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
    TRITONBACKEND_ModelInstance* instance, bool* is_passive);

/// Get the number of optimization profiles to be loaded for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of optimization profiles.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count);

/// Get the name of optimization profile. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'instance'.
///
/// \param instance The model instance.
/// \param index The index of the optimization profile. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceProfileCount.
/// \param profile_name Returns the name of the optimization profile
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
    TRITONBACKEND_ModelInstance* instance, const uint32_t index,
    const char** profile_name);

/// Get the number of secondary devices configured for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of secondary devices.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count);
/// Get the properties of indexed secondary device. The returned
/// strings and other properties are owned by the instance, not the
/// caller, and so should not be modified or freed.
///
/// \param instance The model instance.
/// \param index The index of the secondary device. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceSecondaryDeviceCount.
/// \param kind Returns the kind of secondary device corresponding
/// to the index.
/// \param id Returns the id of secondary device corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
    TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
    int64_t* id);
/// Get the model associated with a model instance.
///
/// \param instance The model instance.
/// \param model Returns the model object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceModel(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model);
/// Get the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceState(
    TRITONBACKEND_ModelInstance* instance, void** state);
/// Set the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceSetState(
    TRITONBACKEND_ModelInstance* instance, void* state);
/// Record statistics for an inference request.
///
/// Set 'success' true to indicate that the inference request
/// completed successfully. In this case all timestamps should be
/// non-zero values reported in nanoseconds and should be collected
/// using std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// Set 'success' to false to indicate that the inference request failed
/// to complete successfully. In this case all timestamps values are
/// ignored.
///
/// For consistency of measurement across different backends, the
/// timestamps should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
///   TRITONBACKEND_ModelInstanceExecute()
///     CAPTURE TIMESTAMP (exec_start_ns)
///     < process input tensors to prepare them for inference
///       execution, including copying the tensors to/from GPU if
///       necessary >
///     CAPTURE TIMESTAMP (compute_start_ns)
///     < perform inference computations to produce outputs >
///     CAPTURE TIMESTAMP (compute_end_ns)
///     < allocate output buffers and extract output tensors, including
///       copying the tensors to/from GPU if necessary >
///     CAPTURE TIMESTAMP (exec_end_ns)
///     return
///
/// Note that these statistics are associated with a valid
/// TRITONBACKEND_Request object and so must be reported before the
/// request is released. For backends that release the request before
/// all response(s) are sent, these statistics cannot capture
/// information about the time required to produce the response.
///
/// \param instance The model instance.
/// \param request The inference request that statistics are being
/// reported for.
/// \param success True if the inference request completed
/// successfully, false if it failed to complete.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
    const bool success, const uint64_t exec_start_ns,
    const uint64_t compute_start_ns, const uint64_t compute_end_ns,
    const uint64_t exec_end_ns);
/// Record statistics for the execution of an entire batch of
/// inference requests.
///
/// All timestamps should be non-zero values reported in nanoseconds
/// and should be collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// See TRITONBACKEND_ModelInstanceReportStatistics for more information about
/// the timestamps.
///
/// 'batch_size' is the sum of the batch sizes for the individual
/// requests that were delivered together in the call to
/// TRITONBACKEND_ModelInstanceExecute. For example, if three requests
/// are passed to TRITONBACKEND_ModelInstanceExecute and those
/// requests have batch size 1, 2, and 3; then 'batch_size' should be
/// set to 6.
///
/// \param instance The model instance.
/// \param batch_size Combined batch size of all the individual
/// requests executed in the batch.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
    TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
    const uint64_t exec_start_ns, const uint64_t compute_start_ns,
    const uint64_t compute_end_ns, const uint64_t exec_end_ns);
///
/// The following functions can be implemented by a backend. Functions
/// indicated as required must be implemented or the backend will fail
/// to load.
///
/// Initialize a backend. This function is optional, a backend is not
/// required to implement it. This function is called once when a
/// backend is loaded to allow the backend to initialize any state
/// associated with the backend. A backend has a single state that is
/// shared across all models that use the backend.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(
    TRITONBACKEND_Backend* backend);
/// Finalize for a backend. This function is optional, a backend is
/// not required to implement it. This function is called once, just
/// before the backend is unloaded. All state associated with the
/// backend should be freed and any threads created for the backend
/// should be exited/joined before returning from this function.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(
    TRITONBACKEND_Backend* backend);
/// Initialize for a model. This function is optional, a backend is
/// not required to implement it. This function is called once when a
/// model that uses the backend is loaded to allow the backend to
/// initialize any state associated with the model. The backend should
/// also examine the model configuration to determine if the
/// configuration is suitable for the backend. Any errors reported by
/// this function will prevent the model from loading.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(
    TRITONBACKEND_Model* model);
/// Finalize for a model. This function is optional, a backend is not
/// required to implement it. This function is called once for a
/// model, just before the model is unloaded from Triton. All state
/// associated with the model should be freed and any threads created
/// for the model should be exited/joined before returning from this
/// function.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(
    TRITONBACKEND_Model* model);
/// Initialize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once when a model instance is created to allow the backend to
/// initialize any state associated with the instance.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(
    TRITONBACKEND_ModelInstance* instance);
/// Finalize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once for an instance, just before the corresponding model is
/// unloaded from Triton. All state associated with the instance
/// should be freed and any threads created for the instance should be
/// exited/joined before returning from this function.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(
    TRITONBACKEND_ModelInstance* instance);
/// Execute a batch of one or more requests on a model instance. This
/// function is required. Triton will not perform multiple
/// simultaneous calls to this function for a given model 'instance';
/// however, there may be simultaneous calls for different model
/// instances (for the same or different models).
///
/// If an error is returned the ownership of the request objects
/// remains with Triton and the backend must not retain references to
/// the request objects or access them in any way.
///
/// If success is returned, ownership of the request objects is
/// transferred to the backend and it is then responsible for creating
/// responses and releasing the request objects. Note that even though
/// ownership of the request objects is transferred to the backend, the
/// ownership of the buffer holding request pointers is returned back
/// to Triton upon return from TRITONBACKEND_ModelInstanceExecute. If
/// any request objects need to be maintained beyond
/// TRITONBACKEND_ModelInstanceExecute, then the pointers must be copied
/// out of the array within TRITONBACKEND_ModelInstanceExecute.
///
/// \param instance The model instance.
/// \param requests The requests.
/// \param request_count The number of requests in the batch.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count);
/// Query the backend for different model attributes. This function is
/// optional; a backend is not required to implement it, nor is it required
/// to set all of the attributes listed. This function is called when Triton
/// requires further backend / model information to perform operations.
/// This function may be called multiple times within the lifetime of the
/// backend (between TRITONBACKEND_Initialize and TRITONBACKEND_Finalize).
/// The backend may return an error to indicate that it failed to set the
/// backend attributes, in which case the attributes specified in that call
/// are ignored. Triton will apply the specified attributes only if 'nullptr'
/// (success) is returned.
///
/// \param backend The backend.
/// \param backend_attributes Return the backend attribute.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes);
/// TRITONBACKEND_BackendAttribute
///
/// API to modify attributes associated with a backend.
///
/// Add a preferred instance group for the backend. This function can be
/// called multiple times to cover the different instance group kinds that
/// the backend supports; the first call describes the most preferred group.
/// When instance groups are not explicitly provided in the model
/// configuration, Triton uses this attribute to create a model deployment
/// that better matches the backend's preference.
///
/// \param backend_attributes The backend attributes object.
/// \param kind The kind of the instance group.
/// \param count The number of instances per device. Triton default will be used
/// if 0 is provided.
/// \param device_ids The devices where instances should be available. Triton
/// default will be used if 'nullptr' is provided.
/// \param id_count The number of devices in 'device_ids'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
    TRITONBACKEND_BackendAttribute* backend_attributes,
    const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
    const uint64_t* device_ids, const uint64_t id_count);
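// Illustration (not part of the upstream header): a backend that prefers GPU
// instances could register that preference from
// TRITONBACKEND_GetBackendAttribute; a sketch, letting Triton defaults apply
// for the instance count and device IDs.
//
//   TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
//       TRITONBACKEND_Backend* backend,
//       TRITONBACKEND_BackendAttribute* backend_attributes)
//   {
//     // Most preferred kind first; 0 / nullptr request the Triton defaults.
//     return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
//         backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU,
//         0 /* count */, nullptr /* device_ids */, 0 /* id_count */);
//   }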
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/include/triton/core/tritonrepoagent.h
0 → 100644
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern
"C"
{
#endif
#ifdef _COMPILING_TRITONREPOAGENT
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllexport)
#define TRITONREPOAGENT_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONREPOAGENT_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONREPOAGENT_ISPEC
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllimport)
#define TRITONREPOAGENT_ISPEC __declspec(dllexport)
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#endif
struct TRITONREPOAGENT_Agent;
struct TRITONREPOAGENT_AgentModel;
///
/// TRITONREPOAGENT API Version
///
/// The TRITONREPOAGENT API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// repository agent should check that the API version used to compile
/// the agent is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the agent.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONREPOAGENT_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONREPOAGENT_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONREPOAGENT_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton repository agent API version does not support this agent");
/// }
///
#define TRITONREPOAGENT_API_VERSION_MAJOR 0
#define TRITONREPOAGENT_API_VERSION_MINOR 1
/// Get the TRITONREPOAGENT API version supported by Triton. This
/// value can be compared against the
/// TRITONREPOAGENT_API_VERSION_MAJOR and
/// TRITONREPOAGENT_API_VERSION_MINOR used to build the agent to
/// ensure that Triton is compatible with the agent.
///
/// \param major Returns the TRITONREPOAGENT API major version supported
/// by Triton.
/// \param minor Returns the TRITONREPOAGENT API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ApiVersion(
    uint32_t* major, uint32_t* minor);
/// TRITONREPOAGENT_ArtifactType
///
/// The ways that the files that make up a model's repository content
/// are communicated between Triton and the agent.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a locally
/// accessible filesystem. The agent can access these files using
/// an appropriate filesystem API.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a remote filesystem.
/// The remote filesystem path follows the same convention as is used for
/// repository paths, for example, "s3://" prefix indicates an S3 path.
///
typedef enum TRITONREPOAGENT_artifacttype_enum {
  TRITONREPOAGENT_ARTIFACT_FILESYSTEM,
  TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM
} TRITONREPOAGENT_ArtifactType;
/// TRITONREPOAGENT_ActionType
///
/// Types of repository actions that can be handled by an agent.
/// The lifecycle of a TRITONREPOAGENT_AgentModel begins with a call to
/// TRITONREPOAGENT_ModelInitialize and ends with a call to
/// TRITONREPOAGENT_ModelFinalize. Between those calls the current lifecycle
/// state of the model is communicated by calls to TRITONREPOAGENT_ModelAction.
/// Possible lifecycles are:
///
/// LOAD -> LOAD_COMPLETE -> UNLOAD -> UNLOAD_COMPLETE
/// LOAD -> LOAD_FAIL
///
/// TRITONREPOAGENT_ACTION_LOAD: A model is being loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_COMPLETE: The model load completed
/// successfully and the model is now loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_FAIL: The model load did not complete
/// successfully. The model is not loaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD: The model is being unloaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE: The model unload is complete.
///
typedef
enum
TRITONREPOAGENT_actiontype_enum
{
TRITONREPOAGENT_ACTION_LOAD
,
TRITONREPOAGENT_ACTION_LOAD_COMPLETE
,
TRITONREPOAGENT_ACTION_LOAD_FAIL
,
TRITONREPOAGENT_ACTION_UNLOAD
,
TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE
}
TRITONREPOAGENT_ActionType
;
/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to the agent as indicated by 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// made available to the agent via the local
/// filesystem. 'location' returns the full path to the directory
/// in the model repository that contains the model's
/// artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The
/// contents of the directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents. Use
/// TRITONREPOAGENT_RepositoryAcquire to get a location that can be
/// used to modify the model repository contents.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// made available to the agent via a remote filesystem.
/// 'location' returns the full path to the remote directory that contains
/// the model's artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The contents of
/// the remote directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents.
/// Use TRITONREPOAGENT_ModelRepositoryLocationAcquire to get a location
/// that can be used to write updated model repository contents.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type Returns the artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocation(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    TRITONREPOAGENT_ArtifactType* artifact_type, const char** location);
/// Acquire a location where the agent can produce a new version of
/// the model repository files. This is a convenience method to create
/// a temporary directory for the agent. The agent is responsible for
/// calling TRITONREPOAGENT_ModelRepositoryLocationRelease in
/// TRITONREPOAGENT_ModelFinalize to delete the location. Initially the
/// acquired location is empty. The 'location' communicated depends on
/// the requested 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The location is a directory
/// on the local filesystem. 'location' returns the full path to
/// an empty directory that the agent should populate with the
/// model's artifacts. The returned location string is owned by
/// Triton, not the agent, and so should not be modified or freed.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationAcquire(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char** location);
/// Discard and release ownership of a previously acquired location
/// and its contents. The agent must not access or modify the location
/// or its contents after this call.
///
/// \param agent The agent.
/// \param model The model.
/// \param path The location to release.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationRelease(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const char* location);
/// Inform Triton that the specified repository location should be used for
/// the model in place of the original model repository. This method can only
/// be called when TRITONREPOAGENT_ModelAction is invoked with
/// TRITONREPOAGENT_ACTION_LOAD. The 'location' communicated depends on how
/// the repository is being communicated to Triton as indicated by
/// 'artifact_type'.
///
///   TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
///     made available to Triton via the local filesystem. 'location' returns
///     the full path to the directory. Ownership of the contents of the
///     returned directory is transferred to Triton and the agent should not
///     modify or free the contents until TRITONREPOAGENT_ModelFinalize.
///     The local filesystem directory can be created using
///     TRITONREPOAGENT_ModelRepositoryLocationAcquire or the agent can use
///     its own local filesystem API.
///
///   TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
///     made available to Triton via a remote filesystem. 'location' returns
///     the full path to the remote filesystem directory. Ownership of the
///     contents of the returned directory is transferred to Triton and the
///     agent should not modify or free the contents until
///     TRITONREPOAGENT_ModelFinalize.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryUpdate(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char* location);
/// Get the number of agent parameters defined for a model.
///
/// \param agent The agent.
/// \param model The model.
/// \param count Returns the number of agent parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameterCount(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    uint32_t* count);
/// Get a parameter name and value. The caller does not own the
/// returned strings and must not modify or delete them.
///
/// \param agent The agent.
/// \param model The model.
/// \param index The index of the parameter. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONREPOAGENT_ModelParameterCount.
/// \param parameter_name Returns the name of the parameter.
/// \param parameter_value Returns the value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelParameter(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t index, const char** parameter_name,
    const char** parameter_value);
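// Illustration (not part of the upstream header): iterating over the agent
// parameters defined for a model; error handling omitted.
//
//   uint32_t count;
//   TRITONREPOAGENT_ModelParameterCount(agent, model, &count);
//   for (uint32_t i = 0; i < count; ++i) {
//     const char* name;
//     const char* value;
//     TRITONREPOAGENT_ModelParameter(agent, model, i, &name, &value);
//     // 'name' and 'value' are owned by Triton; copy them if needed later.
//   }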
/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. If the model repository does not contain a
/// config.pbtxt file then 'model_config' is returned as nullptr.
///
/// \param agent The agent.
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelConfig(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t config_version, TRITONSERVER_Message** model_config);
/// Get the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelState(
    TRITONREPOAGENT_AgentModel* model, void** state);
/// Set the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelSetState(
    TRITONREPOAGENT_AgentModel* model, void* state);
/// Get the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_State(
    TRITONREPOAGENT_Agent* agent, void** state);
/// Set the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_SetState(
    TRITONREPOAGENT_Agent* agent, void* state);
///
/// The following functions can be implemented by an agent. Functions
/// indicated as required must be implemented or the agent will fail
/// to load.
///
/// Initialize an agent. This function is optional. This function is
/// called once when an agent is loaded to allow the agent to
/// initialize any state associated with the agent. An agent has a
/// single state that is shared across all invocations of the agent.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Initialize(
    TRITONREPOAGENT_Agent* agent);
/// Finalize for an agent. This function is optional. This function is
/// called once, just before the agent is unloaded. All state
/// associated with the agent should be freed and any threads created
/// for the agent should be exited/joined before returning from this
/// function.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Finalize(
    TRITONREPOAGENT_Agent* agent);
/// Initialize a model associated with an agent. This function is optional.
/// This function is called once when an agent model's lifecycle begins to allow
/// the agent model to initialize any state associated with it. An agent model
/// has a single state that is shared across all the lifecycle of the agent
/// model.
///
/// \param agent The agent to be associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelInitialize(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Finalize for a model. This function is optional. This function is
/// called once, just before the end of the agent model's lifecycle. All state
/// associated with the agent model should be freed and any threads created
/// for the agent model should be exited/joined before returning from this
/// function. If the model acquired a model location using
/// TRITONREPOAGENT_ModelRepositoryLocationAcquire, it must call
/// TRITONREPOAGENT_ModelRepositoryLocationRelease to release that location.
///
/// \param agent The agent associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelFinalize(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Handle an action for a specified model. This function is
/// required. Triton will not perform multiple simultaneous calls to
/// this function for a given agent and model; however, there may be
/// simultaneous calls for the agent for different models.
///
/// If the agent does not handle the action the agent should
/// immediately return success (nullptr).
///
/// Any modification to the model's repository must be made when 'action_type'
/// is TRITONREPOAGENT_ACTION_LOAD.
/// To modify the model's repository the agent must either acquire a mutable
/// location via TRITONREPOAGENT_ModelRepositoryLocationAcquire
/// or its own managed location, report the location to Triton via
/// TRITONREPOAGENT_ModelRepositoryUpdate, and then return
/// success (nullptr). If the agent does not need to make any changes
/// to the model repository it should simply return success (nullptr)
/// without calling TRITONREPOAGENT_ModelRepositoryUpdate.
/// To indicate that a model load should fail return a non-success status.
///
/// \param agent The agent.
/// \param model The model that is the target of the action.
/// \param action_type The type of action the agent should handle for the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ActionType action_type);
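// Illustration (not part of the upstream header): a TRITONREPOAGENT_ModelAction
// sketch that rewrites the repository contents on load and ignores every other
// action. 'RETURN_IF_ERROR' is a hypothetical error-propagation macro.
//
//   TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
//       TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
//       const TRITONREPOAGENT_ActionType action_type)
//   {
//     if (action_type != TRITONREPOAGENT_ACTION_LOAD) {
//       return nullptr;  // nothing to do for this action
//     }
//     const char* location;
//     RETURN_IF_ERROR(TRITONREPOAGENT_ModelRepositoryLocationAcquire(
//         agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, &location));
//     // ... populate 'location' with the (possibly transformed) model files ...
//     return TRITONREPOAGENT_ModelRepositoryUpdate(
//         agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, location);
//   }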
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/include/triton/core/tritonserver.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern
"C"
{
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONSERVER API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
    uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
  TRITONSERVER_TYPE_INVALID,
  TRITONSERVER_TYPE_BOOL,
  TRITONSERVER_TYPE_UINT8,
  TRITONSERVER_TYPE_UINT16,
  TRITONSERVER_TYPE_UINT32,
  TRITONSERVER_TYPE_UINT64,
  TRITONSERVER_TYPE_INT8,
  TRITONSERVER_TYPE_INT16,
  TRITONSERVER_TYPE_INT32,
  TRITONSERVER_TYPE_INT64,
  TRITONSERVER_TYPE_FP16,
  TRITONSERVER_TYPE_FP32,
  TRITONSERVER_TYPE_FP64,
  TRITONSERVER_TYPE_BYTES,
  TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
    TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType TRITONSERVER_StringToDataType(
    const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param dtype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t TRITONSERVER_DataTypeByteSize(
    TRITONSERVER_DataType datatype);
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
  TRITONSERVER_MEMORY_CPU,
  TRITONSERVER_MEMORY_CPU_PINNED,
  TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
    TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
  TRITONSERVER_PARAMETER_STRING,
  TRITONSERVER_PARAMETER_INT,
  TRITONSERVER_PARAMETER_BOOL,
  TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
    TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
    const char* name, const TRITONSERVER_ParameterType type,
    const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
    const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
    TRITONSERVER_Parameter* parameter);
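// Illustration (not part of the upstream header): creating, using and
// releasing a string parameter; the name and value shown are arbitrary.
//
//   TRITONSERVER_Parameter* param = TRITONSERVER_ParameterNew(
//       "priority", TRITONSERVER_PARAMETER_STRING, "high");
//   if (param != nullptr) {
//     // ... attach the parameter to a request or response ...
//     TRITONSERVER_ParameterDelete(param);
//   }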
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
  TRITONSERVER_INSTANCEGROUPKIND_AUTO,
  TRITONSERVER_INSTANCEGROUPKIND_CPU,
  TRITONSERVER_INSTANCEGROUPKIND_GPU,
  TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
    TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
  TRITONSERVER_LOG_INFO,
  TRITONSERVER_LOG_WARN,
  TRITONSERVER_LOG_ERROR,
  TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
  TRITONSERVER_LOG_DEFAULT,
  TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
    TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
    TRITONSERVER_LogLevel level, const char* filename, const int line,
    const char* msg);
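// Illustration (not part of the upstream header): guarding a log message with
// TRITONSERVER_LogIsEnabled so the message is only formatted when needed.
//
//   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
//     TRITONSERVER_Error* err = TRITONSERVER_LogMessage(
//         TRITONSERVER_LOG_VERBOSE, __FILE__, __LINE__, "starting inference");
//     if (err != nullptr) {
//       TRITONSERVER_ErrorDelete(err);  // logging failure is non-fatal here
//     }
//   }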
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates error and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
  TRITONSERVER_ERROR_UNKNOWN,
  TRITONSERVER_ERROR_INTERNAL,
  TRITONSERVER_ERROR_NOT_FOUND,
  TRITONSERVER_ERROR_INVALID_ARG,
  TRITONSERVER_ERROR_UNAVAILABLE,
  TRITONSERVER_ERROR_UNSUPPORTED,
  TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
    TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code TRITONSERVER_ErrorCode(
    TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
    TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
    TRITONSERVER_Error* error);
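// Illustration (not part of the upstream header): the typical pattern for
// consuming an error returned by any of these APIs; the caller owns the error
// object and must delete it.
//
//   uint32_t major, minor;
//   TRITONSERVER_Error* err = TRITONSERVER_ApiVersion(&major, &minor);
//   if (err != nullptr) {
//     fprintf(
//         stderr, "error: %s - %s\n", TRITONSERVER_ErrorCodeString(err),
//         TRITONSERVER_ErrorMessage(err));
//     TRITONSERVER_ErrorDelete(err);
//   }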
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
    TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after TRITONSERVER_ResponseAllocatorAllocFn_t
/// function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
    *TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
    void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by the allocator, taking into account the caller's
/// preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type
/// ID preferred by the allocator, taking into account the caller's
/// preferred type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp,
    const char* tensor_name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
    TRITONSERVER_ResponseAllocator** allocator,
    TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
    TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
    TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as
/// for the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. Usually the
/// function will be called before alloc_fn to learn the allocator's preferred
/// memory type and memory type ID for the current situation so that a
/// different execution decision can be made.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
    TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from a serialized JSON string.
///
/// \param message The message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
    TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
    TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
    TRITONSERVER_Message* message, const char** base, size_t* byte_size);
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
  TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
    TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
    TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
    const char** base, size_t* byte_size);
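/// For example, assuming a TRITONSERVER_Metrics object 'metrics' obtained
/// from the server elsewhere (illustrative sketch, error checking omitted):
///
///   const char* base = nullptr;
///   size_t byte_size = 0;
///   TRITONSERVER_MetricsFormatted(
///       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
///   // 'base' now points to the Prometheus text, valid until 'metrics'
///   // is deleted.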
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
  /// Tracing disabled. No trace activities are reported.
  TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MIN = 1,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MAX = 2,
  /// Record timestamps for the inference request.
  TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
  /// Record input and output tensor values for the inference request.
  TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
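/// For example, to request both timestamp and tensor tracing the levels can
/// be combined (illustrative sketch):
///
///   auto level = static_cast<TRITONSERVER_InferenceTraceLevel>(
///       TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
///       TRITONSERVER_TRACE_LEVEL_TENSORS);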
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
    TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
  TRITONSERVER_TRACE_REQUEST_START = 0,
  TRITONSERVER_TRACE_QUEUE_START = 1,
  TRITONSERVER_TRACE_COMPUTE_START = 2,
  TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
  TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
  TRITONSERVER_TRACE_COMPUTE_END = 5,
  TRITONSERVER_TRACE_REQUEST_END = 6,
  TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
  TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
  TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
    TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
    void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, const char* name,
    TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
    const int64_t* shape, uint64_t dim_count,
    TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
    TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
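/// A minimal usage sketch (illustrative only, error checking omitted).
/// 'MyTraceActivity' and 'MyTraceRelease' are hypothetical user-provided
/// functions matching TRITONSERVER_InferenceTraceActivityFn_t and
/// TRITONSERVER_InferenceTraceReleaseFn_t:
///
///   TRITONSERVER_InferenceTrace* trace = nullptr;
///   TRITONSERVER_InferenceTraceNew(
///       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
///       MyTraceActivity, MyTraceRelease, nullptr /* trace_userp */);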
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback function will be called to report
/// activity for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
    TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
    TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
    TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
  TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
  TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
/// function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
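/// A sketch of a release callback that follows the recommendation above
/// (illustrative only):
///
///   void MyRequestRelease(
///       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
///       void* userp)
///   {
///     if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
///       // Ownership has been returned; the request may now be reused or
///       // deleted with TRITONSERVER_InferenceRequestDelete.
///       TRITONSERVER_InferenceRequestDelete(request);
///     }
///   }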
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
    TRITONSERVER_InferenceResponse* response, const uint32_t flags,
    void* userp);
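/// A sketch of a response callback (illustrative only); it consumes the
/// response and notes when the final flag is seen:
///
///   void MyResponseComplete(
///       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
///       void* userp)
///   {
///     if (response != nullptr) {
///       // ... read outputs, then release the response ...
///       TRITONSERVER_InferenceResponseDelete(response);
///     }
///     if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
///       // No more responses will be produced for this request.
///     }
///   }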
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server the inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
    TRITONSERVER_InferenceRequest** inference_request,
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version);
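/// For example (illustrative sketch, error checking omitted; 'server' is an
/// existing TRITONSERVER_Server and "mymodel" is a hypothetical model name):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_InferenceRequestNew(
///       &request, server, "mymodel", -1 /* version chosen by policy */);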
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
    TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
    TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
    TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate that two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request,
    uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string.
/// Default is empty "", which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is an unsigned
/// integer, then this function will return a failure. The correlation ID
/// is used to indicate that two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate that two or more inference requests
/// are related to each other. How this relationship is handled by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const TRITONSERVER_DataType datatype, const int64_t* shape,
    uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, the data
/// type, and the shape of the input will be deduced from the model
/// configuration. This function must be called at most once on a request that
/// has no other inputs, so that the deduction is unambiguous.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used to reference the
/// raw input in other Tritonserver APIs. It is not associated with the name
/// used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
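/// For example, to provide a CPU-resident FP32 input with shape [1, 3]
/// (illustrative sketch, error checking omitted; the tensor name "INPUT0"
/// and shape are hypothetical):
///
///   const int64_t shape[] = {1, 3};
///   float data[3] = {0.f, 1.f, 2.f};
///   TRITONSERVER_InferenceRequestAddInput(
///       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
///   TRITONSERVER_InferenceRequestAppendInputData(
///       request, "INPUT0", data, sizeof(data), TRITONSERVER_MEMORY_CPU,
///       0 /* memory_type_id */);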
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will be
/// used, so a version of the data that is not specific to a host policy must
/// also be added using that API.
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
    void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_ResponseAllocator* response_allocator,
    void* response_allocator_userp,
    TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
    void* response_userp);
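/// For example (illustrative sketch, error checking omitted; 'allocator' was
/// created with TRITONSERVER_ResponseAllocatorNew and 'MyResponseComplete' is
/// a hypothetical TRITONSERVER_InferenceResponseCompleteFn_t):
///
///   TRITONSERVER_InferenceRequestSetResponseCallback(
///       request, allocator, nullptr /* response_allocator_userp */,
///       MyResponseComplete, nullptr /* response_userp */);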
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseDelete(
    TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. Return a
/// TRITONSERVER_Error object on failure, return nullptr on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseError(
    TRITONSERVER_InferenceResponse* inference_response);
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model used to produce
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseModel(
    TRITONSERVER_InferenceResponse* inference_response,
    const char** model_name, int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseId(
    TRITONSERVER_InferenceResponse* inference_response,
    const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a void* pointer that must be cast
/// appropriately based on 'type'. For example:
///
/// void* vvalue;
/// TRITONSERVER_ParameterType type;
/// TRITONSERVER_InferenceResponseParameter(
/// response, index, &name, &type, &vvalue);
/// switch (type) {
/// case TRITONSERVER_PARAMETER_BOOL:
/// bool value = *(reinterpret_cast<bool*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_INT:
/// int64_t value = *(reinterpret_cast<int64_t*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_STRING:
/// const char* value = reinterpret_cast<const char*>(vvalue);
/// ...
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseOutput(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint64_t* dim_count, const void** base, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
    void** userp);
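/// For example, to walk all outputs of a response (illustrative sketch,
/// error checking omitted):
///
///   uint32_t count = 0;
///   TRITONSERVER_InferenceResponseOutputCount(response, &count);
///   for (uint32_t i = 0; i < count; ++i) {
///     const char* name; TRITONSERVER_DataType dt; const int64_t* shape;
///     uint64_t dims; const void* base; size_t byte_size;
///     TRITONSERVER_MemoryType mtype; int64_t mtype_id; void* userp;
///     TRITONSERVER_InferenceResponseOutput(
///         response, i, &name, &dt, &shape, &dims, &base, &byte_size,
///         &mtype, &mtype_id, &userp);
///     // ... consume the tensor data referenced by 'base' ...
///   }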
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesNew(
    TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesDelete(
    TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
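/// For example, to describe a CPU buffer of 16 bytes (illustrative sketch,
/// error checking omitted):
///
///   TRITONSERVER_BufferAttributes* attrs = nullptr;
///   TRITONSERVER_BufferAttributesNew(&attrs);
///   TRITONSERVER_BufferAttributesSetMemoryType(attrs, TRITONSERVER_MEMORY_CPU);
///   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
///   TRITONSERVER_BufferAttributesSetByteSize(attrs, 16);
///   // ... use 'attrs', then release it ...
///   TRITONSERVER_BufferAttributesDelete(attrs);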
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If the CUDA IPC handle does not exist for the
/// buffer, nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
  TRITONSERVER_MODEL_CONTROL_NONE,
  TRITONSERVER_MODEL_CONTROL_POLL,
  TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
  TRITONSERVER_RATE_LIMIT_OFF,
  TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsNew(
    TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsDelete(
    TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetServerId(
    TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
    TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in model repository will be
/// loaded on startup. After startup any changes to the model repository will
/// be ignored. Calling TRITONSERVER_ServerPollModelRepository will result in
/// an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in model repository will be
/// loaded on startup. The model repository can be polled periodically using
/// TRITONSERVER_ServerPollModelRepository and the server will load, unload,
/// and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in model repository will
/// not be loaded on startup. The corresponding model control APIs must be
/// called to load / unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect in TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
    TRITONSERVER_ServerOptions* options, const char* model_name);
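/// For example, to configure a server that only loads an explicitly named
/// model (illustrative sketch, error checking omitted; the repository path
/// "/models" and model name "mymodel" are hypothetical):
///
///   TRITONSERVER_ServerOptions* options = nullptr;
///   TRITONSERVER_ServerOptionsNew(&options);
///   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
///   TRITONSERVER_ServerOptionsSetModelControlMode(
///       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
///   TRITONSERVER_ServerOptionsSetStartupModel(options, "mymodel");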
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiting prioritizes
/// inference execution based on the number of times each instance has been
/// given a chance to run. An execution runs only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: The rate limiting is turned off and the
/// inference gets executed whenever an instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for instance groups
/// in the model config to determine whether a resource is global. In case of
/// conflicting resource types in different model configurations, the server
/// will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
    TRITONSERVER_ServerOptions* options, const char* resource_name,
    const size_t resource_count, const int device);
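/// For example, to declare 4 units of a hypothetical resource named "R1" on
/// every device (illustrative sketch, error checking omitted):
///
///   TRITONSERVER_ServerOptionsSetRateLimiterMode(
///       options, TRITONSERVER_RATE_LIMIT_EXEC_COUNT);
///   TRITONSERVER_ServerOptionsAddRateLimiterResource(
///       options, "R1", 4, -1 /* available on every device */);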
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
    TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
    TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
    TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used in buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file a string defining the file where the log outputs will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// facilities to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFile(
    TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogInfo(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogWarn(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogError(
    TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
    TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
    TRITONSERVER_ServerOptions* options, int level);
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetrics(
    TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
    TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
    TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
    TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
    TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
    TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
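The setters above are usually applied in a batch right after the options object is created. The following is a minimal sketch only: it assumes TRITONSERVER_ServerOptionsNew, TRITONSERVER_ErrorMessage and TRITONSERVER_ErrorDelete declared earlier in this header, FAIL_IF_ERR is a hypothetical convenience macro (not part of the API), and the backend directory path is illustrative.

#include <stdio.h>
#include <stdlib.h>
#include "tritonserver.h"

// Hypothetical helper: abort on any TRITONSERVER_Error.
#define FAIL_IF_ERR(X)                                                  \
  do {                                                                  \
    TRITONSERVER_Error* err__ = (X);                                    \
    if (err__ != nullptr) {                                             \
      fprintf(stderr, "error: %s\n", TRITONSERVER_ErrorMessage(err__)); \
      TRITONSERVER_ErrorDelete(err__);                                  \
      exit(1);                                                          \
    }                                                                   \
  } while (false)

TRITONSERVER_ServerOptions* options = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(&options));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetLogVerbose(options, 1));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetMetrics(options, true));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetMetricsInterval(options, 2000));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetBackendDirectory(
    options, "/opt/tritonserver/backends"));  // illustrative path
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetExitOnError(options, true));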
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently supports TRITONSERVER_INSTANCEGROUPKIND_GPU.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
    TRITONSERVER_ServerOptions* options,
    const TRITONSERVER_InstanceGroupKind kind, const int device_id,
    const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
    TRITONSERVER_ServerOptions* options, const char* backend_name,
    const char* setting, const char* value);
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
    TRITONSERVER_ServerOptions* options, const char* policy_name,
    const char* setting, const char* value);
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
  TRITONSERVER_BATCH_UNKNOWN = 1,
  TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
  TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
  TRITONSERVER_TXN_ONE_TO_ONE = 1,
  TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerNew(
    TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerDelete(TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerStop(TRITONSERVER_Server* server);
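A lifecycle sketch tying the three calls above together, continuing the earlier options example. TRITONSERVER_ServerOptionsSetModelRepositoryPath and TRITONSERVER_ServerOptionsDelete are declared earlier in this header; the repository path is illustrative and FAIL_IF_ERR is the hypothetical macro from the previous sketch.

TRITONSERVER_Server* server = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetModelRepositoryPath(
    options, "/models"));  // illustrative path
FAIL_IF_ERR(TRITONSERVER_ServerNew(&server, options));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(options));  // no longer needed

// ... use the server ...

FAIL_IF_ERR(TRITONSERVER_ServerStop(server));   // optional; Delete also stops
FAIL_IF_ERR(TRITONSERVER_ServerDelete(server));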
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as its value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path,
    const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsLive(TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsReady(TRITONSERVER_Server* server, bool* ready);
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIsReady(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, bool* ready);
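A readiness-polling sketch using the three queries above. The model name is illustrative and FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

bool live = false, ready = false, model_ready = false;
for (int attempt = 0; attempt < 10; ++attempt) {
  FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, &live));
  FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, &ready));
  FAIL_IF_ERR(TRITONSERVER_ServerModelIsReady(
      server, "densenet_onnx" /* illustrative */, -1 /* latest per policy */,
      &model_ready));
  if (live && ready && model_ready) {
    break;
  }
  sleep(1);  // from <unistd.h>; any wait primitive works
}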
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* flags, void** voidp);
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* txn_flags, void** voidp);
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetadata(
    TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelMetadata(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelStatistics(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelConfig(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, const uint32_t config_version,
    TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Return the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIndex(
    TRITONSERVER_Server* server, uint32_t flags,
    TRITONSERVER_Message** model_index);
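A sketch that fetches the index of ready models and prints it as JSON. TRITONSERVER_MessageSerializeToJson and TRITONSERVER_MessageDelete are declared earlier in this header; FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

TRITONSERVER_Message* model_index = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerModelIndex(
    server, TRITONSERVER_INDEX_FLAG_READY, &model_index));

const char* index_json = nullptr;
size_t index_json_size = 0;
FAIL_IF_ERR(TRITONSERVER_MessageSerializeToJson(
    model_index, &index_json, &index_json_size));
printf("%.*s\n", (int)index_json_size, index_json);
FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_index));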
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. Returned error indicates if model loaded
/// successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. Returned error indicates if model
/// loaded successfully or not.
/// Currently the below parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
    TRITONSERVER_Server* server, const char* model_name,
    const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully unloaded
/// before returning a success code.
/// Returned error indicates if model unloaded successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent model that
/// was loaded along with the requested model (for example, the models composing
/// an ensemble). Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded before returning a success code.
/// Returned error indicates if model unloaded successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
    TRITONSERVER_Server* server, const char* model_name);
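A sketch of an explicit load with a configuration override followed by an unload. It assumes the server's model-control mode permits load/unload requests, that TRITONSERVER_ParameterNew/Delete are declared earlier in this header, and that the model name and JSON override are purely illustrative. FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

// "config" is the parameter name documented above; its value replaces the
// config.pbtxt found in the model directory for this load only.
const char* override_config =
    "{\"name\":\"densenet_onnx\",\"backend\":\"onnxruntime\",\"max_batch_size\":8}";
TRITONSERVER_Parameter* config_param = TRITONSERVER_ParameterNew(
    "config", TRITONSERVER_PARAMETER_STRING, override_config);
const TRITONSERVER_Parameter* params[] = {config_param};
FAIL_IF_ERR(TRITONSERVER_ServerLoadModelWithParameters(
    server, "densenet_onnx", params, 1));
TRITONSERVER_ParameterDelete(config_param);

// Later, drop the model together with anything loaded along with it.
FAIL_IF_ERR(
    TRITONSERVER_ServerUnloadModelAndDependents(server, "densenet_onnx"));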
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetrics(
    TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerInferAsync(
    TRITONSERVER_Server* server,
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceTrace* trace);
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
  TRITONSERVER_METRIC_KIND_COUNTER,
  TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyNew(
    TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
    const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyDelete(TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricNew(
    TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
    const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricDelete(TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricValue(TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricIncrement(TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The amount to set metric's value to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricSet(TRITONSERVER_Metric* metric, double value);
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_GetMetricKind(
    TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
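Putting the metric API together: a sketch that registers a custom counter, labels it, updates it, and tears it down in the required order (metrics before their family). The family name and label are illustrative; TRITONSERVER_ParameterNew/Delete are declared earlier in this header and FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

TRITONSERVER_MetricFamily* family = nullptr;
FAIL_IF_ERR(TRITONSERVER_MetricFamilyNew(
    &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
    "example counter exported next to Triton's built-in metrics"));

TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
    "route", TRITONSERVER_PARAMETER_STRING, "example");
const TRITONSERVER_Parameter* labels[] = {label};
TRITONSERVER_Metric* metric = nullptr;
FAIL_IF_ERR(TRITONSERVER_MetricNew(&metric, family, labels, 1));
TRITONSERVER_ParameterDelete(label);  // labels may be released after MetricNew

FAIL_IF_ERR(TRITONSERVER_MetricIncrement(metric, 1.0));
double value = 0.0;
FAIL_IF_ERR(TRITONSERVER_MetricValue(metric, &value));

// Metrics must be deleted before their family.
FAIL_IF_ERR(TRITONSERVER_MetricDelete(metric));
FAIL_IF_ERR(TRITONSERVER_MetricFamilyDelete(family));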
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/src/backend_config.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {

namespace {
Status
GetTFSpecializedBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* specialized_name)
{
  std::string tf_version_str = "2";
  const auto& itr = config_map.find("tensorflow");
  if (itr != config_map.end()) {
    if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
      if ((tf_version_str != "1") && (tf_version_str != "2")) {
        return Status(
            Status::Code::INVALID_ARG,
            "unexpected TensorFlow library version '" + tf_version_str +
                "', expects 1 or 2.");
      }
    }
  }

  *specialized_name += tf_version_str;

  return Status::Success;
}

}  // namespace
Status
BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val)
{
  for (const auto& pr : config) {
    if (pr.first == key) {
      *val = pr.second;
      return Status::Success;
    }
  }
  return Status(
      Status::Code::INTERNAL,
      std::string("unable to find common backend configuration for '") + key +
          "'");
}
Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
  try {
    *val = std::stod(str);
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as double");
  }
  return Status::Success;
}
Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
  try {
    std::string lowercase_str{str};
    std::transform(
        lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
        [](unsigned char c) { return std::tolower(c); });
    *val = (lowercase_str == "true");
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as bool");
  }
  return Status::Success;
}
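As an illustrative aside, this is how the helpers above are typically combined when reading a backend's command-line configuration. The keys and values below are invented for the sketch, and BackendCmdlineConfig is the list-of-pairs type from triton/common/model_config.h.

// Illustrative sketch only: parse two hypothetical cmdline settings.
triton::common::BackendCmdlineConfig config = {
    {"gpu-memory-fraction", "0.8"}, {"auto-complete-config", "true"}};

std::string raw;
double fraction = 1.0;
if (triton::core::BackendConfiguration(config, "gpu-memory-fraction", &raw)
        .IsOk()) {
  triton::core::BackendConfigurationParseStringToDouble(raw, &fraction);
}

bool auto_complete = false;
if (triton::core::BackendConfiguration(config, "auto-complete-config", &raw)
        .IsOk()) {
  triton::core::BackendConfigurationParseStringToBool(raw, &auto_complete);
}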
Status
BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
  return Status::Success;
}
Status
BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
  *mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
  *mcc = 0;
#endif  // TRITON_ENABLE_GPU
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find common backend configuration");
  }
  std::string min_compute_capability_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "min-compute-capability", &min_compute_capability_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
      min_compute_capability_str, mcc));
  return Status::Success;
}
Status
BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find auto-complete configuration");
  }
  std::string auto_complete_config_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "auto-complete-config", &auto_complete_config_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToBool(
      auto_complete_config_str, acc));
  return Status::Success;
}
Status
BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name)
{
  *specialized_name = backend_name;
  if (backend_name == "tensorflow") {
    RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
  }
  return Status::Success;
}
Status
BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
  *libname = "triton_" + backend_name + ".dll";
#else
  *libname = "libtriton_" + backend_name + ".so";
#endif
  return Status::Success;
}
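A short sketch of how the two helpers above resolve the file name the server will search for. With an empty configuration map, GetTFSpecializedBackendName leaves the TensorFlow version at its default of "2", so "tensorflow" specializes to "tensorflow2"; status returns are ignored here purely for brevity.

triton::common::BackendCmdlineConfigMap config_map;  // empty: use defaults
std::string specialized, libname;
triton::core::BackendConfigurationSpecializeBackendName(
    config_map, "tensorflow", &specialized);             // -> "tensorflow2"
triton::core::BackendConfigurationBackendLibraryName(specialized, &libname);
// libname: "libtriton_tensorflow2.so" ("triton_tensorflow2.dll" on Windows)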
Status
BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit)
{
  *memory_limit = 1.0;
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  static std::string key_prefix = "model-load-gpu-limit-device-";
  std::string memory_limit_str;
  auto status = BackendConfiguration(
      itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
  // Allow missing key, default to 1.0 (no limit) if the limit is not specified
  if (status.IsOk()) {
    RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
        memory_limit_str, memory_limit));
  }
  return Status::Success;
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_config.h 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val);
/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
    const std::string& str, double* val);
/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(const std::string& str, bool* val);
/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir);
/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);
/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);
/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name);
/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname);
/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit);
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_manager.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
//
// TritonBackend
//
Status
TritonBackend::Create(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  // Create the JSON representation of the backend configuration.
  triton::common::TritonJson::Value backend_config_json(
      triton::common::TritonJson::ValueType::OBJECT);
  if (!backend_cmdline_config.empty()) {
    triton::common::TritonJson::Value cmdline_json(
        backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
    for (const auto& pr : backend_cmdline_config) {
      RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
    }
    RETURN_IF_ERROR(
        backend_config_json.Add("cmdline", std::move(cmdline_json)));
  }
  TritonServerMessage backend_config(backend_config_json);

  auto local_backend = std::shared_ptr<TritonBackend>(
      new TritonBackend(name, dir, libpath, backend_config));

  // Load the library and initialize all the entrypoints
  RETURN_IF_ERROR(local_backend->LoadBackendLibrary());

  // Backend initialization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object. We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
  if (local_backend->backend_init_fn_ != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));

    TRITONSERVER_Error* err = local_backend->backend_init_fn_(
        reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  local_backend->UpdateAttributes();

  *backend = std::move(local_backend);
  return Status::Success;
}
Status
TritonBackend::UpdateAttributes()
{
  if (backend_attri_fn_ == nullptr) {
    return Status::Success;
  }

  // Create an Attribute object for the backend to fill, note that it copies
  // some fields from 'attributes_' while the others use default value. This
  // is an ad hoc way to determine whether the attribute is set by the backend
  // and keep / update current value.
  Attribute latest;
  latest.exec_policy_ = attributes_.exec_policy_;
  RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
      reinterpret_cast<TRITONBACKEND_Backend*>(this),
      reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));

  // Update attributes that were set
  attributes_.exec_policy_ = latest.exec_policy_;
  if (!latest.preferred_groups_.empty()) {
    attributes_.preferred_groups_ = latest.preferred_groups_;
  }
  return Status::Success;
}
TritonBackend::TritonBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath, const TritonServerMessage& backend_config)
    : name_(name), dir_(dir), libpath_(libpath),
      backend_config_(backend_config), state_(nullptr)
{
  ClearHandles();
}
TritonBackend::~TritonBackend()
{
  LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";

  // Backend finalization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object.
  if (backend_fini_fn_ != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
        "failed finalizing backend");
  }

  ClearHandles();
}
void
TritonBackend::ClearHandles()
{
  dlhandle_ = nullptr;
  backend_init_fn_ = nullptr;
  backend_fini_fn_ = nullptr;
  backend_attri_fn_ = nullptr;
  model_init_fn_ = nullptr;
  model_fini_fn_ = nullptr;
  inst_init_fn_ = nullptr;
  inst_fini_fn_ = nullptr;
  inst_exec_fn_ = nullptr;
}
Status
TritonBackend::LoadBackendLibrary()
{
  TritonBackendInitFn_t bifn;
  TritonBackendFiniFn_t bffn;
  TritonBackendAttriFn_t bafn;
  TritonModelInitFn_t mifn;
  TritonModelFiniFn_t mffn;
  TritonModelInstanceInitFn_t iifn;
  TritonModelInstanceFiniFn_t iffn;
  TritonModelInstanceExecFn_t iefn;

  {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));

    RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));

    // Backend initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
        reinterpret_cast<void**>(&bifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
        reinterpret_cast<void**>(&bffn)));
    // Backend attribute function, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
        reinterpret_cast<void**>(&bafn)));
    // Model initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
        reinterpret_cast<void**>(&mifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
        reinterpret_cast<void**>(&mffn)));
    // Model instance initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
        reinterpret_cast<void**>(&iifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
        reinterpret_cast<void**>(&iffn)));
    // Model instance execute function, required
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
        reinterpret_cast<void**>(&iefn)));
  }

  backend_init_fn_ = bifn;
  backend_fini_fn_ = bffn;
  backend_attri_fn_ = bafn;
  model_init_fn_ = mifn;
  model_fini_fn_ = mffn;
  inst_init_fn_ = iifn;
  inst_fini_fn_ = iffn;
  inst_exec_fn_ = iefn;

  return Status::Success;
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
  *major = TRITONBACKEND_API_VERSION_MAJOR;
  *minor = TRITONBACKEND_API_VERSION_MINOR;
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *name = tb->Name().c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
    TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *backend_config = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *policy = tb->ExecutionPolicy();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetExecutionPolicy(policy);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tb->Directory().c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
  static TritonMemoryManager gMemoryManager;
  *manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *state = tb->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetState(state);
  return nullptr;  // success
}
}  // extern C
//
// TritonBackendManager
//
static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;
Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
  std::lock_guard<std::mutex> lock(mu_);

  // If there is already a manager then we just use it...
  *manager = backend_manager_.lock();
  if (*manager != nullptr) {
    return Status::Success;
  }

  manager->reset(new TritonBackendManager());
  backend_manager_ = *manager;
  return Status::Success;
}
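TritonBackendManager::Create hands out a process-wide manager through a static std::weak_ptr: while any owner keeps the shared_ptr alive, new callers reuse the same instance, and once the last owner releases it the next call transparently builds a fresh one. A generic, self-contained sketch of the same pattern (the names are illustrative, not Triton APIs):

#include <memory>
#include <mutex>

class Registry {
 public:
  static std::shared_ptr<Registry> Get()
  {
    static std::mutex mu;
    static std::weak_ptr<Registry> weak;
    std::lock_guard<std::mutex> lock(mu);
    if (auto existing = weak.lock()) {
      return existing;  // an instance is still alive, share it
    }
    auto fresh = std::shared_ptr<Registry>(new Registry());
    weak = fresh;  // remember it without extending its lifetime
    return fresh;
  }

 private:
  Registry() = default;
};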
Status
TritonBackendManager::CreateBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  std::lock_guard<std::mutex> lock(mu_);

  const auto& itr = backend_map_.find(libpath);
  if (itr != backend_map_.end()) {
    *backend = itr->second;
    return Status::Success;
  }

  RETURN_IF_ERROR(TritonBackend::Create(
      name, dir, libpath, backend_cmdline_config, backend));
  backend_map_.insert({libpath, *backend});

  return Status::Success;
}
Status
TritonBackendManager::BackendState(
    std::unique_ptr<
        std::unordered_map<std::string, std::vector<std::string>>>*
        backend_state)
{
  std::lock_guard<std::mutex> lock(mu_);

  std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
      backend_state_map(
          new std::unordered_map<std::string, std::vector<std::string>>);
  for (const auto& backend_pair : backend_map_) {
    auto& libpath = backend_pair.first;
    auto backend = backend_pair.second;

    const char* backend_config;
    size_t backend_config_size;
    backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
    backend_state_map->insert(
        {backend->Name(), std::vector<std::string>{libpath, backend_config}});
  }

  *backend_state = std::move(backend_state_map);

  return Status::Success;
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_manager.h 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Proxy to a backend shared library.
//
class TritonBackend {
 public:
  struct Attribute {
    Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
    TRITONBACKEND_ExecutionPolicy exec_policy_;
    std::vector<inference::ModelInstanceGroup> preferred_groups_;
  };

  typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
      TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
      const uint32_t request_cnt);

  static Status Create(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);
  ~TritonBackend();
  const std::string& Name() const { return name_; }
  const std::string& Directory() const { return dir_; }
  const TritonServerMessage& BackendConfig() const { return backend_config_; }
  const Attribute& BackendAttributes() const { return attributes_; }
  TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
  {
    return attributes_.exec_policy_;
  }
  void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
  {
    attributes_.exec_policy_ = policy;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
  TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
  TritonModelInstanceInitFn_t ModelInstanceInitFn() const
  {
    return inst_init_fn_;
  }
  TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
  {
    return inst_fini_fn_;
  }
  TritonModelInstanceExecFn_t ModelInstanceExecFn() const
  {
    return inst_exec_fn_;
  }
 private:
  typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
      TRITONBACKEND_Backend* backend,
      TRITONBACKEND_BackendAttribute* backend_attributes);

  TritonBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath, const TritonServerMessage& backend_config);

  void ClearHandles();
  Status LoadBackendLibrary();
  Status UpdateAttributes();

  // The name of the backend.
  const std::string name_;

  // Full path to the directory holding backend shared library and
  // other artifacts.
  const std::string dir_;

  // Full path to the backend shared library.
  const std::string libpath_;

  // Backend configuration as JSON
  TritonServerMessage backend_config_;

  // backend attributes
  Attribute attributes_;

  // dlopen / dlsym handles
  void* dlhandle_;
  TritonBackendInitFn_t backend_init_fn_;
  TritonBackendFiniFn_t backend_fini_fn_;
  TritonBackendAttriFn_t backend_attri_fn_;
  TritonModelInitFn_t model_init_fn_;
  TritonModelFiniFn_t model_fini_fn_;
  TritonModelInstanceInitFn_t inst_init_fn_;
  TritonModelInstanceFiniFn_t inst_fini_fn_;
  TritonModelInstanceExecFn_t inst_exec_fn_;

  // Opaque state associated with the backend.
  void* state_;
};
//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
 public:
  static Status Create(std::shared_ptr<TritonBackendManager>* manager);

  Status CreateBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);

  Status BackendState(
      std::unique_ptr<
          std::unordered_map<std::string, std::vector<std::string>>>*
          backend_state);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
  TritonBackendManager() = default;

  std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_memory_manager.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
    TRITONBACKEND_MemoryManager* manager, void** buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
    const uint64_t byte_size)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
    {
      auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "GPU memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
    {
      TRITONSERVER_MemoryType mt = memory_type;
      auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "Pinned memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU: {
      *buffer = malloc(byte_size);
      if (*buffer == nullptr) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
      }
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
    TRITONBACKEND_MemoryManager* manager, void* buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
      auto status = CudaMemoryManager::Free(buffer, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
      auto status = PinnedMemoryManager::Free(buffer);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU:
      free(buffer);
      break;
  }

  return nullptr;  // success
}

}  // extern C

}}  // namespace triton::core
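A hedged backend-side sketch of how these two entry points are typically reached, assuming the tritonbackend.h header from this tree; the scratch size and the choice of CPU memory are arbitrary illustrations, and error handling is abbreviated.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
AllocScratch(TRITONBACKEND_Backend* backend, void** scratch)
{
  TRITONBACKEND_MemoryManager* mm = nullptr;
  TRITONSERVER_Error* err = TRITONBACKEND_BackendMemoryManager(backend, &mm);
  if (err != nullptr) {
    return err;
  }

  // Forwards to the core managers shown above: CUDA, pinned, or plain malloc.
  return TRITONBACKEND_MemoryManagerAllocate(
      mm, scratch, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */,
      1024 /* byte_size */);
  // The caller later releases the buffer with TRITONBACKEND_MemoryManagerFree.
}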
3rdparty/core-r22.12/src/backend_memory_manager.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {

// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_model.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

Status
TritonModel::Create(
    InferenceServer* server, const std::string& model_path,
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
    const std::string& model_name, const int64_t version,
    inference::ModelConfig model_config, const bool is_config_provided,
    std::unique_ptr<TritonModel>* model)
{
  model->reset();

  // The model configuration must specify a backend. The name of the
  // corresponding shared library must be libtriton_<backend>.so.
  if (model_config.backend().empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "must specify 'backend' for '" + model_config.name() + "'");
  }

  // Localize the content of the model repository corresponding to
  // 'model_name'. This model holds a handle to the localized content
  // so that it persists as long as the model is loaded.
  std::shared_ptr<LocalizedPath> localized_model_dir;
  RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));

  // Localize paths in backend model config
  // [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
  RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
      model_path, &model_config, &localized_model_dir));

  // Get some internal configuration values needed for initialization.
  std::string backend_dir;
  RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
      backend_cmdline_config_map, &backend_dir));

  bool auto_complete_config = false;
  RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
      backend_cmdline_config_map, &auto_complete_config));

  double min_compute_capability = 0;
  RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
      backend_cmdline_config_map, &min_compute_capability));

  std::string specialized_backend_name;
  RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
      backend_cmdline_config_map, model_config.backend(),
      &specialized_backend_name));

  std::string backend_libname;
  RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
      specialized_backend_name, &backend_libname));

  // Get the path to the backend shared library. Search path is
  // version directory, model directory, global backend directory.
  const auto localized_model_path = localized_model_dir->Path();
  const auto version_path =
      JoinPath({localized_model_path, std::to_string(version)});
  const std::string global_path =
      JoinPath({backend_dir, specialized_backend_name});
  const std::vector<std::string> search_paths = {
      version_path, localized_model_path, global_path};

  std::string backend_libdir;
  std::string backend_libpath;
  for (const auto& path : search_paths) {
    const auto full_path = JoinPath({path, backend_libname});
    bool exists = false;
    RETURN_IF_ERROR(FileExists(full_path, &exists));
    if (exists) {
      backend_libdir = path;
      backend_libpath = full_path;
      break;
    }
  }

  if (backend_libpath.empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "unable to find '" + backend_libname + "' for model '" +
            model_config.name() + "', searched: " + version_path + ", " +
            model_path + ", " + global_path);
  }

  // Resolve the global backend configuration with the specific backend
  // configuration
  triton::common::BackendCmdlineConfig config;
  RETURN_IF_ERROR(ResolveBackendConfigs(
      backend_cmdline_config_map, model_config.backend(), config));
  RETURN_IF_ERROR(SetBackendConfigDefaults(config));

  std::shared_ptr<TritonBackend> backend;
  RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
      model_config.backend(), backend_libdir, backend_libpath, config,
      &backend));

  // Normalize backend-dependent config
  {
    const auto& attributes = backend->BackendAttributes();
    // [WIP] formalize config normalization / validation
    RETURN_IF_ERROR(NormalizeInstanceGroup(
        min_compute_capability, attributes.preferred_groups_, &model_config));
    RETURN_IF_ERROR(
        ValidateInstanceGroup(model_config, min_compute_capability));
  }

  // Create and initialize the model.
  std::unique_ptr<TritonModel> local_model(new TritonModel(
      server, localized_model_dir, backend, min_compute_capability, version,
      model_config, auto_complete_config));

  TritonModel* raw_local_model = local_model.get();

  // Model initialization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object. We must set the shared library
  // path to point to the backend directory in case the backend
  // library attempts to load additional shared libraries.
  if (backend->ModelInitFn() != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));

    TRITONSERVER_Error* err = backend->ModelInitFn()(
        reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  // Initialize the model for Triton core usage
  RETURN_IF_ERROR(local_model->Init(is_config_provided));

  bool device_blocking = false;
  if (local_model->backend_->ExecutionPolicy() ==
      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
    if (model_config.has_sequence_batching()) {
      LOG_INFO << "Overriding execution policy to "
                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
               << model_config.name() << "\"";
    } else {
      device_blocking = true;
    }
  }

  // Create and initialize the model instances for this model.
  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
      raw_local_model, backend_cmdline_config_map, host_policy_map,
      model_config, device_blocking));

  RETURN_IF_ERROR(local_model->SetConfiguredScheduler());

  *model = std::move(local_model);
  return Status::Success;
}
Status
TritonModel::ResolveBackendConfigs(
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const std::string& backend_name,
    triton::common::BackendCmdlineConfig& config)
{
  const auto& global_itr = backend_cmdline_config_map.find(std::string());
  const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
  if (specific_itr == backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    for (auto setting : global_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr == backend_cmdline_config_map.end()) {
    for (auto setting : specific_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    triton::common::BackendCmdlineConfig global_backend_config =
        global_itr->second;
    triton::common::BackendCmdlineConfig specific_backend_config =
        specific_itr->second;

    std::sort(global_backend_config.begin(), global_backend_config.end());
    std::sort(specific_backend_config.begin(), specific_backend_config.end());

    size_t global_index = 0;
    size_t specific_index = 0;
    while (global_index < global_backend_config.size() &&
           specific_index < specific_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      if (current_specific_setting.first.compare(
              current_global_setting.first) == 0) {
        // specific setting overrides global setting
        config.push_back(current_specific_setting);
        ++global_index;
        ++specific_index;
      } else if (
          current_specific_setting.first.compare(
              current_global_setting.first) < 0) {
        config.push_back(current_specific_setting);
        ++specific_index;
      } else {
        config.push_back(current_global_setting);
        ++global_index;
      }
    }

    // add the rest of the global configs
    if (global_index < global_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      config.push_back(current_global_setting);
    }

    // add the rest of the specific settings
    if (specific_index < specific_backend_config.size()) {
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      config.push_back(current_specific_setting);
    }
  }  // else empty config

  return Status::Success;
}
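To make the merge rule above concrete, here is a small worked example; the backend name and setting values are hypothetical and only illustrate the key-wise merge in which the backend-specific value wins.

// Worked example (values hypothetical):
//   backend_cmdline_config_map[""]            = {{"default-max-batch-size","4"},
//                                                {"verbose","0"}}
//   backend_cmdline_config_map["onnxruntime"] = {{"verbose","1"}}
// ResolveBackendConfigs(map, "onnxruntime", config) yields
//   config = {{"default-max-batch-size","4"}, {"verbose","1"}}
// i.e. both maps are merged by sorted key and, on a key collision,
// the backend-specific setting overrides the global one.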
const std::unordered_map<std::string, std::string> backend_config_defaults(
    {{"default-max-batch-size", "4"}});

Status
TritonModel::SetBackendConfigDefaults(
    triton::common::BackendCmdlineConfig& config)
{
  auto backend_config_defaults_copy = backend_config_defaults;

  for (auto& setting : config) {
    if (setting.first.compare("default-max-batch-size") == 0) {
      LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
                     << "," << setting.second;
      backend_config_defaults_copy.erase(setting.first);
    }

    if (backend_config_defaults_copy.empty()) {
      break;
    }
  }

  // Anything left should be added to the config
  for (const auto& default_setting : backend_config_defaults_copy) {
    LOG_VERBOSE(1) << "Adding default backend config setting: "
                   << default_setting.first << "," << default_setting.second;
    config.push_back(
        std::make_pair(default_setting.first, default_setting.second));
  }

  return Status::Success;
}
Status
TritonModel::AddInstance(
    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
  if (passive) {
    passive_instances_.emplace_back(std::move(instance));
  } else {
    instances_.emplace_back(std::move(instance));
  }
  return Status::Success;
}
Status
TritonModel::UpdateModelConfig(
    const uint32_t config_version,
    TRITONSERVER_Message* updated_config_message)
{
  const char* buffer;
  size_t byte_size;
  RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
      updated_config_message, &buffer, &byte_size));
  inference::ModelConfig updated_config;
  RETURN_IF_ERROR(
      JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
  auto config = Config();
  config.set_max_batch_size(updated_config.max_batch_size());

  auto inputs_config = config.mutable_input();
  *inputs_config = updated_config.input();
  auto outputs_config = config.mutable_output();
  *outputs_config = updated_config.output();

  if (!config.scheduling_choice_case()) {
    if (updated_config.has_dynamic_batching()) {
      auto dynamic_batching_config = config.mutable_dynamic_batching();
      *dynamic_batching_config = updated_config.dynamic_batching();
    } else if (updated_config.has_sequence_batching()) {
      auto sequence_batching_config = config.mutable_sequence_batching();
      *sequence_batching_config = updated_config.sequence_batching();
    } else if (updated_config.has_ensemble_scheduling()) {
      auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
      *ensemble_scheduling_config = updated_config.ensemble_scheduling();
    }  // else do nothing
  } else if (
      config.scheduling_choice_case() !=
      updated_config.scheduling_choice_case()) {
    return Status(
        triton::common::Error::Code::INTERNAL,
        (std::string("Cannot update scheduling choice from ") +
         std::to_string(config.scheduling_choice_case()) +
         std::string(" to ") +
         std::to_string(updated_config.scheduling_choice_case()) +
         std::string(" when auto-completing."))
            .c_str());
  }  // else do nothing

  // Need to normalize the model configuration for
  // populating missing fields.
  RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));

  RETURN_IF_ERROR(SetModelConfig(config));

  return Status::Success;
}
Status
TritonModel::SetConfiguredScheduler()
{
  std::unique_ptr<Scheduler> scheduler;

  // Need to enforce equal shape batches (i.e. non-ragged batches) if
  // the model 1) allows one or more variable-size input tensors that
  // are not marked as 'allow_ragged_batch' or 2) has one or more
  // shape-tensor inputs. This is not needed if all input shapes are
  // non-variable and if there are no shape tensors... so we don't
  // enable it in that case for efficiency reasons.
  std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
  for (const auto input : config_.input()) {
    if (input.is_shape_tensor()) {
      enforce_equal_shape_tensors.insert({input.name(), true});
    } else if (
        !input.allow_ragged_batch() &&
        (triton::common::GetElementCount(input) == -1)) {
      enforce_equal_shape_tensors.insert({input.name(), false});
    }
  }

  // If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
  // otherwise use the default DynamicBatchScheduler.
  if (config_.has_sequence_batching()) {
    // Sequence batcher
    RETURN_IF_ERROR(SequenceBatchScheduler::Create(
        this, enforce_equal_shape_tensors, &scheduler));
  } else if (config_.has_dynamic_batching()) {
    // Dynamic batcher
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
        config_.max_batch_size(), enforce_equal_shape_tensors,
        config_.dynamic_batching(),
        config_.response_cache().enable() /* response_cache_enable */,
        &scheduler));
  } else {
    // Default scheduler. Use dynamic batch scheduler (with batching
    // disabled) as the default scheduler.
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
        1 /* max_batch_size */,
        std::unordered_map<
            std::string, bool>() /* enforce_equal_shape_tensors */,
        false /* preserve_ordering */,
        config_.response_cache().enable() /* response_cache_enable */,
        std::set<int32_t>() /* preferred_batch_sizes */,
        0 /* max_queue_delay_microseconds */, &scheduler));
  }

  return SetScheduler(std::move(scheduler));
}
Status
TritonModel::Initialize()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->Initialize());
  }

  return Status::Success;
}

Status
TritonModel::WarmUp()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->WarmUp());
  }

  return Status::Success;
}
TritonModel::TritonModel(
    InferenceServer* server,
    const std::shared_ptr<LocalizedPath>& localized_model_dir,
    const std::shared_ptr<TritonBackend>& backend,
    const double min_compute_capability, const int64_t version,
    const inference::ModelConfig& config, const bool auto_complete_config)
    : Model(
          min_compute_capability, localized_model_dir->Path(), version,
          config),
      server_(server), min_compute_capability_(min_compute_capability),
      auto_complete_config_(auto_complete_config),
      localized_model_dir_(localized_model_dir), backend_(backend),
      state_(nullptr)
{
}

TritonModel::~TritonModel()
{
  // Explicitly delete/finalize all model instances before finalizing
  // the model itself.
  instances_.clear();
  passive_instances_.clear();

  // Unregister itself from the rate limiter. Note this should happen
  // after all instances are destructed. Destructing instances ensures
  // there are no instance threads waiting on rate limiter for
  // receiving their payloads.
  server_->GetRateLimiter()->UnregisterModel(this);

  // Model finalization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object.
  if (backend_->ModelFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
        "failed finalizing model");
  }
}
extern "C" {

//
// TRITONBACKEND_Model
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *name = tm->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *version = tm->Version();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
    TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tm->LocalizedModelPath().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message** model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);

  std::string model_config_json;
  Status status =
      ModelConfigToJson(tm->Config(), config_version, &model_config_json);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *model_config = reinterpret_cast<TRITONSERVER_Message*>(
      new TritonServerMessage(std::move(model_config_json)));

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
    TRITONBACKEND_Model* model, bool* auto_complete_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *auto_complete_config = tm->AutoCompleteConfig();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message* model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  Status status = tm->UpdateModelConfig(config_version, model_config);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
    TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
    TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *state = tm->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  tm->SetState(state);
  return nullptr;  // success
}
///
/// TRITONBACKEND_Request
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *id = tr->Id().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(
    TRITONBACKEND_Request* request, uint64_t* id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() !=
      InferenceRequest::SequenceId::DataType::UINT64) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not an unsigned int")
            .c_str());
  }
  *id = correlation_id.UnsignedIntValue();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *flags = tr->Flags();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
    TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() !=
      InferenceRequest::SequenceId::DataType::STRING) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not a string")
            .c_str());
  }
  *id = correlation_id.StringValue().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(
    TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableInputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** input_name)
{
  *input_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input_name = in->Name().c_str();
      break;
    }
  }

  return nullptr;  // success
}
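A minimal backend-side sketch of how the request/input functions above are commonly combined to walk all inputs of a request; it only uses APIs defined in this tree, and the abbreviated error handling is an assumption made to keep the sketch short.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

void
ListRequestInputs(TRITONBACKEND_Request* request)
{
  uint32_t count = 0;
  TRITONBACKEND_RequestInputCount(request, &count);
  for (uint32_t i = 0; i < count; ++i) {
    const char* name = nullptr;
    if (TRITONBACKEND_RequestInputName(request, i, &name) == nullptr) {
      TRITONBACKEND_Input* input = nullptr;
      TRITONBACKEND_RequestInput(request, name, &input);
      // 'input' can now be queried with TRITONBACKEND_InputProperties()
      // and its buffers read with TRITONBACKEND_InputBuffer().
    }
  }
}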
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
    TRITONBACKEND_Request* request, const char* name,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  const auto& itr = inputs.find(name);
  if (itr == inputs.end()) {
    *input = nullptr;
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "unknown request input name " + name).c_str());
  }

  InferenceRequest::Input* in = itr->second;
  *input = reinterpret_cast<TRITONBACKEND_Input*>(in);

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
    TRITONBACKEND_Request* request, const uint32_t index,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input = reinterpret_cast<TRITONBACKEND_Input*>(in);
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
    TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableRequestedOutputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** output_name)
{
  *output_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& routputs = tr->ImmutableRequestedOutputs();
  if (index >= routputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(routputs.size()) +
         " requested outputs")
            .c_str());
  }

  // The requested outputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // set. This linear search is the best we can do given the requested
  // outputs being in a set and given the typical small number of
  // requested outputs it should not be a performance issue.
  uint32_t cnt = 0;
  for (const auto& rout : routputs) {
    if (cnt++ == index) {
      *output_name = rout.c_str();
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  auto status =
      tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestRelease(
    TRITONBACKEND_Request* request, uint32_t release_flags)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  std::unique_ptr<InferenceRequest> ur(tr);
  InferenceRequest::Release(std::move(ur), release_flags);
  return nullptr;  // success
}
///
/// TRITONBACKEND_State
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
  SequenceState* ts = reinterpret_cast<SequenceState*>(state);
  auto status = ts->Update();
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
    TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  SequenceState* lstate;
  std::vector<int64_t> lshape(shape, shape + dims_count);
  auto& sequence_state = tr->GetSequenceStates();

  if (sequence_state == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("unable to add state '") + name +
         "'. State configuration is missing for model '" + tr->ModelName() +
         "'.")
            .c_str());
  }

  Status status = sequence_state->OutputState(
      name, TritonToDataType(datatype), lshape, &lstate);
  if (!status.IsOk()) {
    *state = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  *state = reinterpret_cast<TRITONBACKEND_State*>(lstate);

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
    TRITONBACKEND_State* state, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  Status status = Status::Success;

  // If the buffer size exactly matches the buffer available, reuse the
  // currently allocated buffer.
  if (to->Data()->TotalByteSize() == buffer_byte_size) {
    const std::shared_ptr<AllocatedMemory>& memory =
        reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
    TRITONSERVER_MemoryType current_memory_type;
    int64_t current_memory_type_id;
    void* lbuffer =
        memory->MutableBuffer(&current_memory_type, &current_memory_type_id);

    // If the requested memory type doesn't match the current buffer,
    // allocate a new buffer with the requested memory type and memory
    // type id.
    if (current_memory_type == *memory_type &&
        current_memory_type_id == *memory_type_id) {
      *buffer = lbuffer;
    } else {
      std::shared_ptr<AllocatedMemory> memory =
          std::make_shared<AllocatedMemory>(
              buffer_byte_size, *memory_type, *memory_type_id);
      *buffer = memory->MutableBuffer(memory_type, memory_type_id);
      to->RemoveAllData();
      status = to->SetData(memory);
    }
  } else {
    std::shared_ptr<AllocatedMemory> memory =
        std::make_shared<AllocatedMemory>(
            buffer_byte_size, *memory_type, *memory_type_id);
    *buffer = memory->MutableBuffer(memory_type, memory_type_id);
    to->RemoveAllData();
    status = to->SetData(memory);
  }

  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}
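A hedged sketch of the implicit-state flow these three functions support, from a backend's point of view: create the named state for the request, obtain a buffer, fill it, then commit the value for the next request in the sequence. The state name "beam_state" is a hypothetical example and error handling is abbreviated.

// Illustrative sketch only; not part of the committed sources.
#include <cstring>
#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
WriteState(TRITONBACKEND_Request* request, const float* data, size_t count)
{
  TRITONBACKEND_State* state = nullptr;
  int64_t shape[1] = {static_cast<int64_t>(count)};
  TRITONSERVER_Error* err = TRITONBACKEND_StateNew(
      &state, request, "beam_state", TRITONSERVER_TYPE_FP32, shape, 1);
  if (err != nullptr) {
    return err;
  }

  void* buffer = nullptr;
  TRITONSERVER_MemoryType mtype = TRITONSERVER_MEMORY_CPU;
  int64_t mtype_id = 0;
  err = TRITONBACKEND_StateBuffer(
      state, &buffer, count * sizeof(float), &mtype, &mtype_id);
  if (err != nullptr) {
    return err;
  }
  if (mtype == TRITONSERVER_MEMORY_CPU) {
    std::memcpy(buffer, data, count * sizeof(float));
  }

  // Make the new value visible to the next request in the sequence.
  return TRITONBACKEND_StateUpdate(state);
}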
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
    TRITONBACKEND_State* state,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  to->Data()->BufferAt(
      0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  return nullptr;  // success
}

//
// TRITONBACKEND_ResponseFactory
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
    TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
  *factory =
      reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  delete response_factory;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
    TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  Status status = (*response_factory)->SendFlags(send_flags);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Response
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
    TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);

  std::unique_ptr<InferenceResponse> tresp;
  Status status = tr->ResponseFactory()->CreateResponse(&tresp);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
    TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);

  std::unique_ptr<InferenceResponse> tr;
  Status status = (*response_factory)->CreateResponse(&tr);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  delete tr;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
    TRITONBACKEND_Response* response, const char* name, const char* value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
    TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
    TRITONBACKEND_Response* response, const char* name, const bool value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
    TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  std::vector<int64_t> lshape(shape, shape + dims_count);
  InferenceResponse::Output* loutput;
  Status status = tr->AddOutput(
      name, TritonToDataType(datatype), std::move(lshape), &loutput);
  if (!status.IsOk()) {
    *output = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
    TRITONBACKEND_Response* response, const uint32_t send_flags,
    TRITONSERVER_Error* error)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);

  Status status;

  std::unique_ptr<InferenceResponse> utr(tr);
  if (error == nullptr) {
    status = InferenceResponse::Send(std::move(utr), send_flags);
  } else {
    status = InferenceResponse::SendWithStatus(
        std::move(utr), send_flags,
        Status(
            TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
            TRITONSERVER_ErrorMessage(error)));
  }

  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
    TRITONBACKEND_Input* input, const char** name,
    TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (byte_size != nullptr) {
    *byte_size = ti->Data()->TotalByteSize();
  }
  if (buffer_count != nullptr) {
    *buffer_count = ti->DataBufferCount();
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (host_policy_name != nullptr) {
    if (byte_size != nullptr) {
      *byte_size = ti->Data(host_policy_name)->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
    }
  } else {
    if (byte_size != nullptr) {
      *byte_size = ti->Data()->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCount();
    }
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBuffer(
      index, buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBufferAttributes(
      index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_attributes = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);

  Status status =
      (host_policy_name == nullptr)
          ? ti->DataBuffer(
                index, buffer, buffer_byte_size, memory_type, memory_type_id)
          : ti->DataBufferForHostPolicy(
                index, buffer, buffer_byte_size, memory_type, memory_type_id,
                host_policy_name);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
    TRITONBACKEND_Output* output, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  Status status = to->AllocateDataBuffer(
      buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
    TRITONBACKEND_Output* output,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  *buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
      to->GetBufferAttributes());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
    TRITONBACKEND_BackendAttribute* backend_attributes,
    const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
    const uint64_t* device_ids, const uint64_t id_count)
{
  auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
  ba->preferred_groups_.emplace_back();
  auto& pg = ba->preferred_groups_.back();
  switch (kind) {
    case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
      pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_CPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_GPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
      pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
      break;
  }
  pg.set_count(count);
  if (device_ids != nullptr) {
    for (size_t i = 0; i < id_count; ++i) {
      pg.add_gpus(device_ids[i]);
    }
  }
  return nullptr;
}

}  // extern C

}}  // namespace triton::core
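A hedged sketch of the backend side of the attribute API above: a backend that exports TRITONBACKEND_GetBackendAttribute can register a preferred instance group that Triton consults when the model config does not pin one down. The single KIND_GPU group shown is an arbitrary illustration, not a value taken from this commit.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes)
{
  // Prefer one KIND_GPU instance per model unless the config says otherwise.
  return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
      backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU, 1 /* count */,
      nullptr /* device_ids */, 0 /* id_count */);
}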
3rdparty/core-r22.12/src/backend_model.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {

class InferenceServer;
class TritonModelInstance;

//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
 public:
  static Status Create(
      InferenceServer* server, const std::string& model_path,
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const std::string& model_name, const int64_t version,
      inference::ModelConfig model_config, const bool is_config_provided,
      std::unique_ptr<TritonModel>* model);
  ~TritonModel();

  const std::string& LocalizedModelPath() const
  {
    return localized_model_dir_->Path();
  }

  InferenceServer* Server() { return server_; }
  bool AutoCompleteConfig() const { return auto_complete_config_; }

  Status UpdateModelConfig(
      const uint32_t config_version,
      TRITONSERVER_Message* updated_config_message);

  const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }

  const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
  {
    return instances_;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  Status AddInstance(
      std::unique_ptr<TritonModelInstance>&& instance, const bool passive);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModel);

  TritonModel(
      InferenceServer* server,
      const std::shared_ptr<LocalizedPath>& localized_model_dir,
      const std::shared_ptr<TritonBackend>& backend,
      const double min_compute_capability, const int64_t version,
      const inference::ModelConfig& config, const bool auto_complete_config);

  // Set the scheduler based on the model configuration. The scheduler
  // can only be set once for a backend.
  Status SetConfiguredScheduler();

  // Merges the global backend configs with the specific
  // backend configs.
  static Status ResolveBackendConfigs(
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const std::string& backend_name,
      triton::common::BackendCmdlineConfig& config);

  // Sets defaults for some backend configurations when none are specified on
  // the command line.
  static Status SetBackendConfigDefaults(
      triton::common::BackendCmdlineConfig& config);

  Status Initialize();
  Status WarmUp();

  // The server object that owns this model. The model holds this as a
  // raw pointer because the lifetime of the server is guaranteed to
  // be longer than the lifetime of a model owned by the server.
  InferenceServer* server_;

  // The minimum supported compute capability on device.
  const double min_compute_capability_;

  // Whether the backend should attempt to auto-complete the model config.
  const bool auto_complete_config_;

  // The localized repo directory holding the model. If localization
  // required creation of a temporary local copy then that copy will
  // persist as long as this object is retained by this model.
  std::shared_ptr<LocalizedPath> localized_model_dir_;

  // Backend used by this model.
  std::shared_ptr<TritonBackend> backend_;

  // The model instances for this model.
  std::vector<std::unique_ptr<TritonModelInstance>> instances_;
  std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;

  // Opaque state associated with this model.
  void* state_;
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_model_instance.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

namespace {

// Utilities for warmup feature
TRITONSERVER_Error*
WarmupResponseAlloc(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
    int64_t preferred_memory_type_id, void* userp, void** buffer,
    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id)
{
  *buffer = malloc(byte_size);
  if (*buffer != nullptr) {
    *actual_memory_type = TRITONSERVER_MEMORY_CPU;
    *actual_memory_type_id = 0;
    return nullptr;
  }

  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL,
      "failed to allocate output buffer for warmup.");
}

TRITONSERVER_Error*
WarmupResponseRelease(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id)
{
  free(buffer);
  return nullptr;
}

ResponseAllocator warmup_allocator = ResponseAllocator(
    WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);

void
WarmupResponseComplete(
    TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
    void* userp)
{
  auto res_pair = reinterpret_cast<
      std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
  if (iresponse != nullptr) {
    auto err = TRITONSERVER_InferenceResponseError(iresponse);
    if (err != nullptr) {
      // The error vector is shared by all requests in the batch for now
      static std::mutex res_mtx;
      {
        std::lock_guard<std::mutex> lk(res_mtx);
        res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
      }
      TRITONSERVER_ErrorDelete(err);
    }
    // Just delete the response, warmup doesn't check for correctness
    LOG_TRITONSERVER_ERROR(
        TRITONSERVER_InferenceResponseDelete(iresponse),
        "deleting warmup response");
  }
  // Last response
  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
    res_pair->first.set_value();
  }
}

void
WarmupRequestComplete(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
  if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
    // Don't need to release the request here, it is managed in WarmupData
    if (userp != nullptr) {
      auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
      warmup_promise->set_value();
    }
  }
}

}  // namespace

TritonModelInstance::TritonModelInstance(
    TritonModel* model, const std::string& name, const size_t index,
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const std::vector<std::string>& profile_names, const bool passive,
    const triton::common::HostPolicyCmdlineConfig& host_policy,
    const TritonServerMessage& host_policy_message,
    const std::vector<SecondaryDevice>& secondary_devices)
    : model_(model), name_(name), index_(index), kind_(kind),
      device_id_(device_id), host_policy_(host_policy),
      host_policy_message_(host_policy_message), profile_names_(profile_names),
      passive_(passive), secondary_devices_(secondary_devices),
      state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
  if (Metrics::Enabled()) {
    // Use an ID in the metric only for GPU instances. Otherwise use
    // METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
    // metric.
    const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
                       ? device_id_
                       : METRIC_REPORTER_ID_CPU;
    MetricModelReporter::Create(
        model_->Name(), model_->Version(), id, model_->Config().metric_tags(),
        &reporter_);
  }
#endif  // TRITON_ENABLE_METRICS
}

TritonModelInstance::~TritonModelInstance()
{
  if (triton_backend_thread_.get() != nullptr) {
    triton_backend_thread_->StopBackendThread();
  }

  // Model finalization is optional...
  if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        model_->Backend()->ModelInstanceFiniFn()(
            reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
        "failed finalizing model instance");
  }
}

Status
TritonModelInstance::CreateInstances(
    TritonModel* model,
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
    const inference::ModelConfig& model_config, const bool device_blocking)
{
  static triton::common::HostPolicyCmdlineConfig empty_host_policy;

  // This structure is used to allocate TritonBackendThread to instances on the
  // same device for the device blocking execution policy.
  std::map<uint32_t, std::shared_ptr<TritonBackendThread>>
      device_to_thread_map;

  for (const auto& group : model_config.instance_group()) {
    std::vector<std::string> profile_names;
    for (const auto& profile_name : group.profile()) {
      profile_names.push_back(profile_name);
    }
    std::vector<SecondaryDevice> secondary_devices;
    for (const auto& secondary_device : group.secondary_devices()) {
      secondary_devices.emplace_back(
          inference::
              ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
                  secondary_device.kind()),
          secondary_device.device_id());
    }
    for (int32_t c = 0; c < group.count(); ++c) {
      std::string instance_name{
          group.count() > 1 ? group.name() + "_" + std::to_string(c)
                            : group.name()};
      const bool passive = group.passive();
      std::vector<std::tuple<
          std::string, TRITONSERVER_InstanceGroupKind, int32_t,
          const inference::ModelRateLimiter*>>
          instance_setting;
      if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
        instance_setting.emplace_back(
            group.host_policy().empty() ? "cpu" : group.host_policy(),
            TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
            &group.rate_limiter());
      } else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
        for (const int32_t device_id : group.gpus()) {
          instance_setting.emplace_back(
              group.host_policy().empty()
                  ? ("gpu_" + std::to_string(device_id))
                  : group.host_policy(),
              TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
              &group.rate_limiter());
        }
      } else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
        instance_setting.emplace_back(
            group.host_policy().empty() ? "model" : group.host_policy(),
            TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
            &group.rate_limiter());
      } else {
        return Status(
            Status::Code::INVALID_ARG,
            std::string("instance_group kind ") +
                ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
      }
      for (const auto is : instance_setting) {
        const auto& kind = std::get<1>(is);
        const auto& id = std::get<2>(is);
        const std::string& policy_name = std::get<0>(is);
        const triton::common::HostPolicyCmdlineConfig* host_policy;
        const auto policy_it = host_policy_map.find(policy_name);
        if (policy_it != host_policy_map.end()) {
          host_policy = &policy_it->second;
        } else {
          host_policy = &empty_host_policy;
        }
        RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
        auto err = CreateInstance(
            model, instance_name, c, kind, id, profile_names, passive,
            policy_name, *host_policy, *(std::get<3>(is)), device_blocking,
            &device_to_thread_map, secondary_devices);
        RETURN_IF_ERROR(ResetNumaMemoryPolicy());
        RETURN_IF_ERROR(err);

        // When deploying on GPU, we want to make sure the GPU memory usage
        // is within the allowed range; otherwise, stop the creation to ensure
        // there is sufficient GPU memory for other use.
        // We check the usage after loading the instance to better enforce
        // the limit. If we checked before loading, we might create an instance
        // that occupies the rest of available memory, which defeats the
        // purpose.
        if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
          size_t free, total;
          double memory_limit;
          RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
          RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
              backend_cmdline_config_map, id, &memory_limit));
          const size_t allow = total * memory_limit;
          const size_t used = total - free;
          if (used > allow) {
            return Status(
                Status::Code::UNAVAILABLE,
                std::string("can not create model '") + instance_name +
                    "': memory limit set for " +
                    TRITONSERVER_InstanceGroupKindString(kind) + " " +
                    std::to_string(id) +
                    " has exceeded, model loading is rejected.");
          }
        }
      }
    }
  }

  return Status::Success;
}

Status
TritonModelInstance::CreateInstance(
    TritonModel* model, const std::string& name, const size_t index,
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const std::vector<std::string>& profile_names, const bool passive,
    const std::string& host_policy_name,
    const triton::common::HostPolicyCmdlineConfig& host_policy,
    const inference::ModelRateLimiter& rate_limiter_config,
    const bool device_blocking,
    std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
        device_to_thread_map,
    const std::vector<SecondaryDevice>& secondary_devices)
{
  // Create the JSON representation of the backend configuration.
  triton::common::TritonJson::Value host_policy_json(
      triton::common::TritonJson::ValueType::OBJECT);
  triton::common::TritonJson::Value policy_setting_json(
      host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
  for (const auto& pr : host_policy) {
    RETURN_IF_ERROR(
        policy_setting_json.AddString(pr.first.c_str(), pr.second));
  }

  RETURN_IF_ERROR(host_policy_json.Add(
      host_policy_name.c_str(), std::move(policy_setting_json)));
  TritonServerMessage host_policy_message(host_policy_json);

  std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
      model, name, index, kind, device_id, profile_names, passive, host_policy,
      host_policy_message, secondary_devices));

  TRITONBACKEND_ModelInstance* triton_instance =
      reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());

  // Instance initialization is optional... We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
  if (model->Backend()->ModelInstanceInitFn() != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));

    TRITONSERVER_Error* err =
        model->Backend()->ModelInstanceInitFn()(triton_instance);

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  if (!passive) {
    RETURN_IF_ERROR(local_instance->GenerateWarmupData());
    RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
        local_instance.get(), rate_limiter_config));
    RETURN_IF_ERROR(local_instance->SetBackendThread(
        kind, device_id, device_blocking, device_to_thread_map));
  }

  RETURN_IF_ERROR(model->AddInstance(std::move(local_instance), passive));

  return Status::Success;
}

Status
TritonModelInstance::SetBackendThread(
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const bool device_blocking,
    std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
        device_to_thread_map)
{
  if (device_blocking && (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU)) {
    auto thread_it = device_to_thread_map->find(device_id);
    if (thread_it != device_to_thread_map->end()) {
      LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
                     << " on device " << device_id;
      triton_backend_thread_ = thread_it->second;
    }
  }
  if (triton_backend_thread_.get() == nullptr) {
    std::unique_ptr<TritonBackendThread> local_backend_thread;
    RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
        Name(), this, 0 /* nice */, device_id, &local_backend_thread));
    triton_backend_thread_ = std::move(local_backend_thread);
    device_to_thread_map->insert({device_id, triton_backend_thread_});
  } else {
    triton_backend_thread_->AddModelInstance(this);
  }
  RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));

  return Status::Success;
}

Status
TritonModelInstance::GenerateWarmupData()
{
  warmup_samples_.clear();
  for (const auto& warmup_setting : model_->Config().model_warmup()) {
    if (warmup_setting.batch_size() == 0) {
      LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
                     << warmup_setting.name() << "'";
      continue;
    }
    LOG_VERBOSE(1) << "Generating warmup sample data for '"
                   << warmup_setting.name() << "'";

    // Two passes. First pass to get max byte size for synthetic
    // data. Second pass to add original inputs and override inputs
    // for control inputs.
    int64_t max_zero_byte_size = 0;
    int64_t max_random_byte_size = 0;
    for (const auto& input_meta : warmup_setting.inputs()) {
      auto element_count =
          triton::common::GetElementCount(input_meta.second.dims());
      if (element_count == -1) {
        return Status(
            Status::Code::INVALID_ARG,
            "warmup setting expects all variable-size dimensions are specified "
            "for input '" +
                input_meta.first + "'");
      }

      int64_t batch_byte_size =
          element_count *
          triton::common::GetDataTypeByteSize(input_meta.second.data_type());
      if (batch_byte_size == 0) {
        batch_byte_size = element_count * sizeof(int32_t);
      }

      switch (input_meta.second.input_data_type_case()) {
        case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
          max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
          break;
        case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
          // Because Triton expects STRING type to be in a special format
          // (prepend 4 bytes to specify string length), use zero data
          // for simplicity (4 bytes * element count of zeros).
          if (input_meta.second.data_type() ==
              inference::DataType::TYPE_STRING) {
            max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
          } else {
            max_random_byte_size =
                std::max(batch_byte_size, max_random_byte_size);
          }
          break;
        }
        default:
          break;
      }
    }

    warmup_samples_.emplace_back(
        warmup_setting.name(), warmup_setting.count());
    auto& warmup_data = warmup_samples_.back();
    // Create buffers for synthetic data
    TRITONSERVER_MemoryType type;
    int64_t type_id;
    warmup_data.zero_data_.reset(new AllocatedMemory(
        max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
        0 /* memory_type_id */));
    char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
    memset(zero_buffer, 0, max_zero_byte_size);

    warmup_data.random_data_.reset(new AllocatedMemory(
        max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
        0 /* memory_type_id */));
    char* random_buffer =
        warmup_data.random_data_->MutableBuffer(&type, &type_id);
    for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
      random_buffer[offset] = rand();
    }

    // Prepare the inference request for the specified sample. We do not use
    // the in-process C API because the request doesn't go through the same
    // pipeline (i.e. no normalization / scheduler), so we need to prepare the
    // request to the state just before calling the instance execute function.
    for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
      warmup_data.requests_.emplace_back(
          new InferenceRequest(model_, model_->Version()));
      auto& lrequest = warmup_data.requests_.back();

      // Second pass to prepare original inputs.
      std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
      for (const auto& input_meta : warmup_setting.inputs()) {
        auto batch1_element_count =
            triton::common::GetElementCount(input_meta.second.dims());
        auto batch_byte_size =
            batch1_element_count *
            triton::common::GetDataTypeByteSize(input_meta.second.data_type());
        if (batch_byte_size == 0) {
          batch_byte_size = batch1_element_count * sizeof(int32_t);
        }

        const char* allocated_ptr;
        switch (input_meta.second.input_data_type_case()) {
          case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
            allocated_ptr = zero_buffer;
            break;
          case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
            if (input_meta.second.data_type() ==
                inference::DataType::TYPE_STRING) {
              allocated_ptr = zero_buffer;
            } else {
              allocated_ptr = random_buffer;
            }
            break;
          }
          case inference::ModelWarmup_Input::InputDataTypeCase::
              kInputDataFile: {
            // For data provided from file, we can set the buffer in the first
            // pass
            warmup_data.provided_data_.emplace_back(new std::string());
            auto input_data = warmup_data.provided_data_.back().get();
            RETURN_IF_ERROR(ReadTextFile(
                JoinPath({model_->LocalizedModelPath(), kWarmupDataFolder,
                          input_meta.second.input_data_file()}),
                input_data));
            if (input_meta.second.data_type() ==
                inference::DataType::TYPE_STRING) {
              batch_byte_size = input_data->size();
            } else if (((size_t)batch_byte_size) > input_data->size()) {
              return Status(
                  Status::Code::INVALID_ARG,
                  lrequest->LogRequest() + "warmup setting expects " +
                      std::to_string(batch_byte_size) +
                      " bytes, but the data "
                      "provided from " +
                      input_meta.second.input_data_file() + "only has " +
                      std::to_string(input_data->size()) + " bytes");
            }
            allocated_ptr = input_data->data();
            break;
          }
          default:
            return Status(
                Status::Code::INVALID_ARG,
                lrequest->LogRequest() + "warmup setting expects input '" +
                    input_meta.first + "' to have input_data_type set");
        }

        const inference::ModelInput* input_config;
        bool is_original_input =
            model_->GetInput(input_meta.first, &input_config).IsOk();
        InferenceRequest::Input* input = nullptr;
        std::vector<int64_t> input_meta_shape;
        // Append batch size only if the model supports batching
        // and this is not a control input.
        if ((model_->Config().max_batch_size() != 0) && is_original_input) {
          input_meta_shape.push_back(1);
        }
        for (auto d : input_meta.second.dims()) {
          input_meta_shape.push_back(d);
        }
        if (is_original_input) {
          RETURN_IF_ERROR(lrequest->AddOriginalInput(
              input_meta.first, input_meta.second.data_type(),
              input_meta_shape, &input));
        } else {
          input_sps.emplace_back();
          RETURN_IF_ERROR(lrequest->AddOverrideInput(
              input_meta.first, input_meta.second.data_type(),
              (model_->Config().max_batch_size() != 0 ? 1 : 0),
              input_meta_shape, &input_sps.back()));
          input = input_sps.back().get();
        }
        RETURN_IF_ERROR(input->AppendData(
            allocated_ptr, batch_byte_size,
            TRITONSERVER_MEMORY_CPU /* memory_type */,
            0 /* memory_type_id */));
      }

      RETURN_IF_ERROR(lrequest->PrepareForInference());
      // Override inputs must be added after PrepareForInference() is called
      for (const auto& sp : input_sps) {
        RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
      }
    }
  }

  return Status::Success;
}

void
TritonModelInstance::Schedule(
    std::vector<std::unique_ptr<InferenceRequest>>&& requests,
    const std::function<void()>& OnCompletion)
{
  // Use a thread local vector to avoid needing to malloc each
  // time an inference is run.
  thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
  triton_requests.clear();
  for (auto& r : requests) {
    // Load the input states for the inference request.
    r->LoadInputStates();
    triton_requests.push_back(
        reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
  }

  Execute(triton_requests);
  OnCompletion();
}

Status
TritonModelInstance::Initialize()
{
  RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
  return Status::Success;
}

Status
TritonModelInstance::WarmUp()
{
  // Move samples to a local variable for scoped cleanup
  std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
  lwarmup_samples.swap(warmup_samples_);

  for (auto& sample : lwarmup_samples) {
    for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
      LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
                     << "' instance " << Name()
                     << " is running warmup sample '" << sample.sample_name_
                     << "' for iteration " << iteration;

      // Request/response completion is asynchronous, so use promises to wait
      // for completion. Also collect error messages from the responses in a
      // vector.
      std::vector<std::promise<void>> request_complete(
          sample.requests_.size());
      std::vector<std::string> response_errors;
      std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
          response_complete(sample.requests_.size());

      std::vector<TRITONBACKEND_Request*> triton_requests;
      for (size_t i = 0; i < sample.requests_.size(); ++i) {
        auto& request = sample.requests_[i];
        request->SetReleaseCallback(
            WarmupRequestComplete, &request_complete[i]);
        response_complete[i].second = &response_errors;
        request->SetResponseCallback(
            &warmup_allocator, nullptr, WarmupResponseComplete,
            &response_complete[i]);
        // Capture timestamp before run to avoid incorrect accumulation from
        // sequential warmup runs
#ifdef TRITON_ENABLE_STATS
        request->CaptureRequestStartNs();
#endif  // TRITON_ENABLE_STATS
        request->CaptureQueueStartNs();
        triton_requests.push_back(
            reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
      }

      Execute(triton_requests);

      // Wait for the warmup sample to complete and check for errors
      for (size_t i = 0; i < sample.requests_.size(); ++i) {
        request_complete[i].get_future().get();
        response_complete[i].first.get_future().get();
      }
      if (response_errors.size() != 0) {
        std::string err_str =
            "failed to run warmup sample '" + sample.sample_name_ + "': ";
        for (const auto& error : response_errors) {
          err_str += (error + "; ");
        }
        // End warmup as soon as there is a failing sample
        LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
                       << "' instance " << Name()
                       << " failed to run warmup sample '"
                       << sample.sample_name_ << "'";
        return Status(Status::Code::INVALID_ARG, err_str);
      }
    }
  }

  return Status::Success;
}

void
TritonModelInstance::Execute(
    std::vector<TRITONBACKEND_Request*>& triton_requests)
{
  TRITONBACKEND_ModelInstance* triton_model_instance =
      reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
  TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
      model_->Backend()->ModelInstanceExecFn();

  // If there is an error then we retain ownership of 'requests'
  // and must send error responses.
  TRITONSERVER_Error* err = inst_exec_fn(
      triton_model_instance, &triton_requests[0], triton_requests.size());
  if (err != nullptr) {
    Status status = Status(
        TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
        TRITONSERVER_ErrorMessage(err));
    for (TRITONBACKEND_Request* tr : triton_requests) {
      std::unique_ptr<InferenceRequest> ur(
          reinterpret_cast<InferenceRequest*>(tr));
      InferenceRequest::RespondIfError(
          ur, status, true /* release_requests */);
    }

    TRITONSERVER_ErrorDelete(err);
  }
}

Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
    const std::string name, TritonModelInstance* model_instance,
    const int nice, const int32_t device_id,
    std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
  TritonBackendThread* raw_triton_backend_thread =
      new TritonBackendThread(name, model_instance->Model());
  std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
  runner->AddModelInstance(model_instance);
  runner->backend_thread_ =
      std::thread([raw_triton_backend_thread, nice, device_id]() {
        raw_triton_backend_thread->BackendThread(nice, device_id);
      });

  triton_backend_thread->reset(runner.release());

  return Status::Success;
}

void
TritonModelInstance::TritonBackendThread::AddModelInstance(
    TritonModelInstance* model_instance)
{
  model_instances_.push_back(model_instance);
}

Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
    TritonModelInstance* model_instance)
{
  // Initialize the instance on the backend thread
  auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
      Payload::Operation::INIT, model_instance);
  RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
      model_, init_payload));
  RETURN_IF_ERROR(init_payload->Wait());

  // Warm up the instance on the backend thread
  auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
      Payload::Operation::WARM_UP, model_instance);
  RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
      model_, warmup_payload));
  RETURN_IF_ERROR(warmup_payload->Wait());

  return Status::Success;
}

TritonModelInstance::TritonBackendThread::TritonBackendThread(
    const std::string& name, TritonModel* model)
    : name_(name), model_(model)
{
}

TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
  StopBackendThread();
}

void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
  if (backend_thread_.joinable()) {
    // Signal the backend thread to exit and then wait for it...
    auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
        Payload::Operation::EXIT, model_instances_.back());
    model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
    backend_thread_.join();
  }
}

void
TritonModelInstance::TritonBackendThread::BackendThread(
    const int nice, const int32_t device_id)
{
#ifndef _WIN32
  if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
    LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
                   << nice << " on device " << device_id << "...";
  } else {
    LOG_VERBOSE(1) << "Starting backend thread for " << name_
                   << " at default nice (requested nice " << nice << " failed)"
                   << " on device " << device_id << "...";
  }
#else
  LOG_VERBOSE(1) << "Starting backend thread for " << name_
                 << " at default nice on device " << device_id << "...";
#endif

  bool should_exit = false;
  while (!should_exit) {
    std::shared_ptr<Payload> payload;
    model_->Server()->GetRateLimiter()->DequeuePayload(
        model_instances_, &payload);
    NVTX_RANGE(nvtx_, "BackendThread " + name_);
    payload->Execute(&should_exit);
    model_instances_.push_back(payload->GetInstance());
    // Release the payload to the RateLimiter
    model_->Server()->GetRateLimiter()->PayloadRelease(payload);
  }
  LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
    TRITONBACKEND_ModelInstance* instance, const char** name)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *name = ti->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
    TRITONBACKEND_ModelInstance* instance,
    TRITONSERVER_InstanceGroupKind* kind)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *kind = ti->Kind();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
    TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *device_id = ti->DeviceId();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *host_policy = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(
          &ti->HostPolicyMessage()));
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->Profiles().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
    TRITONBACKEND_ModelInstance* instance, const uint32_t index,
    const char** profile_name)
{
  *profile_name = nullptr;

  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rprofiles = ti->Profiles();
  if (index >= rprofiles.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " + std::to_string(rprofiles.size()) +
         " profiles")
            .c_str());
  }

  *profile_name = rprofiles[index].c_str();

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->SecondaryDevices().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
    TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
    int64_t* id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rsecondarydevices = ti->SecondaryDevices();

  if (index >= rsecondarydevices.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " +
         std::to_string(rsecondarydevices.size()) + " secondary devices")
            .c_str());
  }

  *kind = rsecondarydevices[index].kind_.c_str();
  *id = rsecondarydevices[index].id_;

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
    TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *is_passive = ti->IsPassive();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
    TRITONBACKEND_ModelInstance* instance, void** state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *state = ti->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
    TRITONBACKEND_ModelInstance* instance, void* state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->SetState(state);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
    const bool success, const uint64_t exec_start_ns,
    const uint64_t compute_start_ns, const uint64_t compute_end_ns,
    const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  tr->ReportStatistics(
      ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
    TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
    const uint64_t exec_start_ns, const uint64_t compute_start_ns,
    const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
      ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS

  return nullptr;  // success
}

}  // extern C

}}  // namespace triton::core
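For reference, a minimal sketch of how a backend shared library might call the extern "C" accessors defined above from its instance-initialization entry point. TRITONBACKEND_ModelInstanceInitialize is the standard backend hook that the server invokes through ModelInstanceInitFn(); the std::string state object and the manual error propagation below are illustrative assumptions, not part of this file.

// Illustrative sketch only: a hypothetical backend querying the instance
// accessors defined above and attaching its own opaque state.
#include <cstdint>
#include <string>
#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
  const char* name = nullptr;
  TRITONSERVER_Error* err = TRITONBACKEND_ModelInstanceName(instance, &name);
  if (err != nullptr) {
    return err;
  }

  TRITONSERVER_InstanceGroupKind kind;
  if ((err = TRITONBACKEND_ModelInstanceKind(instance, &kind)) != nullptr) {
    return err;
  }

  int32_t device_id = 0;
  if ((err = TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)) !=
      nullptr) {
    return err;
  }

  // Attach backend-specific state; the server hands the same opaque pointer
  // back through TRITONBACKEND_ModelInstanceState() and it would typically be
  // released in TRITONBACKEND_ModelInstanceFinalize.
  return TRITONBACKEND_ModelInstanceSetState(instance, new std::string(name));
}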
3rdparty/core-r22.12/src/backend_model_instance.h
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {

class TritonModel;
class InferenceRequest;

//
// Represents a model instance.
//
class TritonModelInstance {
 public:
  static Status CreateInstances(
      TritonModel* model,
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const inference::ModelConfig& model_config, const bool device_blocking);
  ~TritonModelInstance();

  const std::string& Name() const { return name_; }
  size_t Index() const { return index_; }
  TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
  int32_t DeviceId() const { return device_id_; }
  const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
  {
    return host_policy_;
  }
  const TritonServerMessage& HostPolicyMessage() const
  {
    return host_policy_message_;
  }
  bool IsPassive() const { return passive_; }
  const std::vector<std::string>& Profiles() const { return profile_names_; }

  struct SecondaryDevice {
    SecondaryDevice(const std::string kind, const int64_t id)
        : kind_(kind), id_(id)
    {
    }
    const std::string kind_;
    const int64_t id_;
  };
  const std::vector<SecondaryDevice>& SecondaryDevices() const
  {
    return secondary_devices_;
  }

  Status Initialize();
  Status WarmUp();
  void Schedule(
      std::vector<std::unique_ptr<InferenceRequest>>&& requests,
      const std::function<void()>& OnCompletion);

  TritonModel* Model() const { return model_; }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  MetricModelReporter* MetricReporter() const { return reporter_.get(); }

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);

  class TritonBackendThread;

  TritonModelInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const TritonServerMessage& host_policy_message,
      const std::vector<SecondaryDevice>& secondary_devices);

  static Status CreateInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const std::string& host_policy_name,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const inference::ModelRateLimiter& rate_limiter_config,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map,
      const std::vector<SecondaryDevice>& secondary_devices);

  Status SetBackendThread(
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map);

  Status GenerateWarmupData();

  void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);

  class TritonBackendThread {
   public:
    static Status CreateBackendThread(
        const std::string name, TritonModelInstance* model, const int nice,
        const int32_t device_id,
        std::unique_ptr<TritonBackendThread>* triton_backend_thread);
    void AddModelInstance(TritonModelInstance* model_instance);
    Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
    void StopBackendThread();
    ~TritonBackendThread();

   private:
    TritonBackendThread(const std::string& name, TritonModel* model);
    void BackendThread(const int nice, const int32_t device_id);

    std::string name_;

    TritonModel* model_;
    std::deque<TritonModelInstance*> model_instances_;

    std::thread backend_thread_;
    std::atomic<bool> backend_thread_exit_;
  };
  std::shared_ptr<TritonBackendThread> triton_backend_thread_;

  struct WarmupData {
    WarmupData(const std::string& sample_name, const size_t count)
        : sample_name_(sample_name), count_(std::max(count, size_t{1}))
    {
    }

    std::string sample_name_;
    size_t count_;
    // Using a batch of requests to satisfy batch size, this provides better
    // alignment on the batch expected by the model, especially for sequence
    // models.
    std::vector<std::unique_ptr<InferenceRequest>> requests_;

    // Placeholder for input data
    std::unique_ptr<AllocatedMemory> zero_data_;
    std::unique_ptr<AllocatedMemory> random_data_;
    std::vector<std::unique_ptr<std::string>> provided_data_;
  };
  std::vector<WarmupData> warmup_samples_;

  // The TritonModel object that owns this instance. The instance
  // holds this as a raw pointer because the lifetime of the model is
  // guaranteed to be longer than the lifetime of an instance owned by the
  // model.
  TritonModel* model_;

  std::string name_;
  size_t index_;

  // For CPU, device_id_ is always 0. For GPU, device_id_ indicates the
  // GPU device to be used by the instance.
  TRITONSERVER_InstanceGroupKind kind_;
  int32_t device_id_;
  const triton::common::HostPolicyCmdlineConfig host_policy_;
  TritonServerMessage host_policy_message_;
  std::vector<std::string> profile_names_;
  bool passive_;

  std::vector<SecondaryDevice> secondary_devices_;

  // Reporter for metrics, or nullptr if no metrics should be reported
  std::shared_ptr<MetricModelReporter> reporter_;

  // Opaque state associated with this model instance.
  void* state_;
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.cc
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {

void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
  byte_size_ = byte_size;
}

void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
  memory_type_ = memory_type;
}

void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
  memory_type_id_ = memory_type_id;
}

void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
  char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
  cuda_ipc_handle_.clear();
  std::copy(
      lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
      std::back_inserter(cuda_ipc_handle_));
}

void*
BufferAttributes::CudaIpcHandle()
{
  if (cuda_ipc_handle_.empty()) {
    return nullptr;
  } else {
    return reinterpret_cast<void*>(cuda_ipc_handle_.data());
  }
}

size_t
BufferAttributes::ByteSize() const
{
  return byte_size_;
}

TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
  return memory_type_;
}

int64_t
BufferAttributes::MemoryTypeId() const
{
  return memory_type_id_;
}

BufferAttributes::BufferAttributes(
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, char* cuda_ipc_handle)
    : byte_size_(byte_size), memory_type_(memory_type),
      memory_type_id_(memory_type_id)
{
  // cuda ipc handle size
  cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);

  if (cuda_ipc_handle != nullptr) {
    std::copy(
        cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
        std::back_inserter(cuda_ipc_handle_));
  }
}

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.h
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
#pragma once
namespace triton { namespace core {

//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
 public:
  BufferAttributes(
      size_t byte_size, TRITONSERVER_MemoryType memory_type,
      int64_t memory_type_id, char cuda_ipc_handle[64]);
  BufferAttributes()
  {
    memory_type_ = TRITONSERVER_MEMORY_CPU;
    memory_type_id_ = 0;
    cuda_ipc_handle_.reserve(64);
  }

  // Set the buffer byte size
  void SetByteSize(const size_t& byte_size);

  // Set the buffer memory_type
  void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);

  // Set the buffer memory type id
  void SetMemoryTypeId(const int64_t& memory_type_id);

  // Set the cuda ipc handle
  void SetCudaIpcHandle(void* cuda_ipc_handle);

  // Get the cuda ipc handle
  void* CudaIpcHandle();

  // Get the byte size
  size_t ByteSize() const;

  // Get the memory type
  TRITONSERVER_MemoryType MemoryType() const;

  // Get the memory type id
  int64_t MemoryTypeId() const;

 private:
  size_t byte_size_;
  TRITONSERVER_MemoryType memory_type_;
  int64_t memory_type_id_;
  std::vector<char> cuda_ipc_handle_;
};

}}  // namespace triton::core
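A minimal usage sketch of the BufferAttributes class declared above; the function name and the chosen size and memory type are illustrative values, not taken from this file.

// Illustrative sketch: describing a 1 MiB buffer that resides on GPU 0.
#include <cstdint>
#include "buffer_attributes.h"

void
DescribeOutputBuffer()
{
  triton::core::BufferAttributes attrs;
  attrs.SetByteSize(1 << 20);
  attrs.SetMemoryType(TRITONSERVER_MEMORY_GPU);
  attrs.SetMemoryTypeId(0);

  // No CUDA IPC handle was set, so CudaIpcHandle() returns nullptr here.
  void* ipc = attrs.CudaIpcHandle();
  (void)ipc;
}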
3rdparty/core-r22.12/src/constants.h
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {

constexpr char kInferHeaderContentLengthHTTPHeader[] =
    "Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";

constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";

constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";

constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";

constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";

constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";

constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";

#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif  // TRITON_ENABLE_ENSEMBLE

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
    "auto_mixed_precision";

constexpr char kModelConfigPbTxt[] = "config.pbtxt";

constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";

constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";

constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;

#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif

#define TIMESPEC_TO_NANOS(TS) \
  ((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
  (TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)

#define DISALLOW_MOVE(TypeName) TypeName(Context&& o) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  DISALLOW_COPY(TypeName)                  \
  DISALLOW_ASSIGN(TypeName)

}}  // namespace triton::core
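A small sketch of how the timing and copy-control macros above are typically used. The Widget class and NowNanos helper are hypothetical, and clock_gettime assumes a POSIX platform; nothing here is part of this header.

// Illustrative sketch: convert a timespec to nanoseconds and disable copy
// semantics on a hypothetical class using the macros defined above.
#include <cstdint>
#include <ctime>
#include "constants.h"

class Widget {
 public:
  Widget() = default;

 private:
  // Expands to deleted copy constructor and copy assignment.
  DISALLOW_COPY_AND_ASSIGN(Widget);
};

uint64_t
NowNanos()
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return TIMESPEC_TO_NANOS(ts);
}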
3rdparty/core-r22.12/src/cuda_memory_manager.cc
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace
{
#define RETURN_IF_CNMEM_ERROR(S, MSG) \
do { \
auto status__ = (S); \
if (status__ != CNMEM_STATUS_SUCCESS) { \
return Status( \
Status::Code::INTERNAL, \
(MSG) + ": " + cnmemGetErrorString(status__)); \
} \
} while (false)
std
::
string
PointerToString
(
void
*
ptr
)
{
std
::
stringstream
ss
;
ss
<<
ptr
;
return
ss
.
str
();
}
}
// namespace
namespace
triton
{
namespace
core
{
std
::
unique_ptr
<
CudaMemoryManager
>
CudaMemoryManager
::
instance_
;
std
::
mutex
CudaMemoryManager
::
instance_mu_
;
CudaMemoryManager
::~
CudaMemoryManager
()
{
if
(
has_allocation_
)
{
auto
status
=
cnmemFinalize
();
if
(
status
!=
CNMEM_STATUS_SUCCESS
)
{
LOG_ERROR
<<
"Failed to finalize CUDA memory manager: ["
<<
status
<<
"] "
<<
cnmemGetErrorString
(
status
);
}
}
}
void
CudaMemoryManager
::
Reset
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
instance_mu_
);
instance_
.
reset
();
}
Status
CudaMemoryManager
::
Create
(
const
CudaMemoryManager
::
Options
&
options
)
{
// Ensure thread-safe creation of CUDA memory pool
std
::
lock_guard
<
std
::
mutex
>
lock
(
instance_mu_
);
if
(
instance_
!=
nullptr
)
{
LOG_WARNING
<<
"New CUDA memory pools could not be created since they "
"already exists"
;
return
Status
::
Success
;
}
std
::
set
<
int
>
supported_gpus
;
auto
status
=
GetSupportedGPUs
(
&
supported_gpus
,
options
.
min_supported_compute_capability_
);
if
(
status
.
IsOk
())
{
std
::
vector
<
cnmemDevice_t
>
devices
;
for
(
auto
gpu
:
supported_gpus
)
{
const
auto
it
=
options
.
memory_pool_byte_size_
.
find
(
gpu
);
if
((
it
!=
options
.
memory_pool_byte_size_
.
end
())
&&
(
it
->
second
!=
0
))
{
devices
.
emplace_back
();
auto
&
device
=
devices
.
back
();
memset
(
&
device
,
0
,
sizeof
(
device
));
device
.
device
=
gpu
;
device
.
size
=
it
->
second
;
LOG_INFO
<<
"CUDA memory pool is created on device "
<<
device
.
device
<<
" with size "
<<
device
.
size
;
}
}
if
(
!
devices
.
empty
())
{
RETURN_IF_CNMEM_ERROR
(
cnmemInit
(
devices
.
size
(),
devices
.
data
(),
CNMEM_FLAGS_CANNOT_GROW
),
std
::
string
(
"Failed to finalize CUDA memory manager"
));
}
else
{
LOG_INFO
<<
"CUDA memory pool disabled"
;
}
// Use to finalize CNMeM properly when out of scope
instance_
.
reset
(
new
CudaMemoryManager
(
!
devices
.
empty
()));
}
else
{
return
Status
(
status
.
ErrorCode
(),
"Failed to initialize CUDA memory manager: "
+
status
.
Message
());
}
return
Status
::
Success
;
}
Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemMalloc(ptr, size, nullptr);
  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to allocate CUDA memory with byte size ") +
               std::to_string(size) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}
Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemFree(ptr, nullptr);
  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to deallocate CUDA memory at address ") +
               PointerToString(ptr) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}

}}  // namespace triton::core
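A minimal usage sketch of the allocation path above, assuming CudaMemoryManager::Create() has already succeeded during server startup; the byte size, device id, and calling context are hypothetical and not part of this commit.

  void* buffer = nullptr;
  Status status =
      CudaMemoryManager::Alloc(&buffer, 1 << 20 /* 1 MiB */, 0 /* GPU 0 */);
  if (!status.IsOk()) {
    LOG_ERROR << status.Message();
  } else {
    // ... use 'buffer' as ordinary device memory on GPU 0 ...
    status = CudaMemoryManager::Free(buffer, 0 /* GPU 0 */);
    if (!status.IsOk()) {
      LOG_ERROR << status.Message();
    }
  }

If Create() was never called, or no pool was reserved, both Alloc() and Free() return Status::Code::UNAVAILABLE rather than falling back to plain cudaMalloc.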
3rdparty/core-r22.12/src/cuda_memory_manager.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {

// This is a singleton class responsible for maintaining CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via functions provided by this class.
class CudaMemoryManager {
 public:
  // Options to configure CUDA memory manager.
  struct Options {
    Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
        : min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
    {
    }

    // The minimum compute capability of the supported devices.
    double min_supported_compute_capability_;

    // The size of CUDA memory reserved for the specified devices.
    // The memory size will be rounded up to align with
    // the default granularity (512 bytes).
    // No memory will be reserved for devices that are not listed.
    std::map<int, uint64_t> memory_pool_byte_size_;
  };

  ~CudaMemoryManager();

  // Create the memory manager based on 'options' specified.
  // Return Status object indicating success or failure.
  static Status Create(const Options& options);

  // Allocate CUDA memory on GPU 'device_id' with
  // the requested 'size' and return the pointer in 'ptr'.
  // Return Status object indicating success or failure.
  static Status Alloc(void** ptr, uint64_t size, int64_t device_id);

  // Free the memory allocated by the memory manager on 'device_id'.
  // Return Status object indicating success or failure.
  static Status Free(void* ptr, int64_t device_id);

 protected:
  // Provide explicit control on the lifecycle of the CUDA memory manager,
  // for testing only.
  static void Reset();

 private:
  CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}

  bool has_allocation_;
  static std::unique_ptr<CudaMemoryManager> instance_;
  static std::mutex instance_mu_;
};

}}  // namespace triton::core
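A minimal configuration sketch for the Options struct documented above, assuming it runs once at startup; the pool sizes and device ids are hypothetical. Devices omitted from the map get no pool, and an entry with size 0 has the same effect.

  // Reserve a 256 MiB pool on GPU 0 and a 128 MiB pool on GPU 1, requiring
  // compute capability 6.0 or newer.
  CudaMemoryManager::Options options(
      6.0 /* min compute capability */,
      {{0, 256 * 1024 * 1024}, {1, 128 * 1024 * 1024}});
  Status status = CudaMemoryManager::Create(options);
  if (!status.IsOk()) {
    LOG_ERROR << "CUDA memory pool creation failed: " << status.Message();
  }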
3rdparty/core-r22.12/src/cuda_utils.cc
0 → 100644
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {

#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
  auto* copy_params = reinterpret_cast<CopyParams*>(args);
  memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
  delete copy_params;
}
#endif  // TRITON_ENABLE_GPU
Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
  *free = 0;
  *total = 0;
#ifdef TRITON_ENABLE_GPU
  // Make sure the correct device is set before querying its memory info and
  // then restore the device to what was set by the caller.
  int current_device;
  auto cuerr = cudaGetDevice(&current_device);
  bool overridden = false;
  if (cuerr == cudaSuccess) {
    overridden = (current_device != device_id);
    if (overridden) {
      cuerr = cudaSetDevice(device_id);
    }
  }

  if (cuerr == cudaSuccess) {
    cuerr = cudaMemGetInfo(free, total);
  }

  if (overridden) {
    cudaSetDevice(current_device);
  }

  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        (std::string("unable to get memory info for device ") +
         std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}
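// Illustrative call (sketch, not part of the upstream source): query free and
// total bytes on GPU 0; the caller's current device is restored internally.
// In non-GPU builds both values stay 0 and Status::Success is returned.
//
//   size_t free_bytes = 0, total_bytes = 0;
//   Status status = GetDeviceMemoryInfo(0, &free_bytes, &total_bytes);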
Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
  // If we can't enable peer access for one device pair, the best we can
  // do is skipping it...
  std::set<int> supported_gpus;
  bool all_enabled = false;
  if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
    all_enabled = true;
    int can_access_peer = false;
    for (const auto& host : supported_gpus) {
      auto cuerr = cudaSetDevice(host);
      if (cuerr == cudaSuccess) {
        for (const auto& peer : supported_gpus) {
          if (host == peer) {
            continue;
          }

          cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
          if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
            cuerr = cudaDeviceEnablePeerAccess(peer, 0);
          }

          all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
        }
      }
    }
  }

  if (!all_enabled) {
    return Status(
        Status::Code::UNSUPPORTED,
        "failed to enable peer access for some device pairs");
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}
Status
CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
  NVTX_RANGE(nvtx_, "CopyBuffer");

  *cuda_used = false;

  // For CUDA memcpy, a host-to-host copy is blocking with respect to the
  // host, so use memcpy() directly. In this case, need to be careful on
  // whether the src buffer is valid.
  if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
      (dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
    if (copy_on_stream) {
      auto params = new CopyParams(dst, src, byte_size);
      cudaLaunchHostFunc(
          cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
      *cuda_used = true;
    } else {
      memcpy(dst, src, byte_size);
    }
#else
    memcpy(dst, src, byte_size);
#endif  // TRITON_ENABLE_GPU
  } else {
#ifdef TRITON_ENABLE_GPU
    RETURN_IF_CUDA_ERR(
        cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
        msg + ": failed to perform CUDA copy");
    *cuda_used = true;
#else
    return Status(
        Status::Code::INTERNAL,
        msg + ": try to use CUDA copy while GPU is not supported");
#endif  // TRITON_ENABLE_GPU
  }

  return Status::Success;
}
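// Illustrative sketch (not part of the upstream source): one way a caller
// might stage a host-to-GPU copy through CopyBuffer(). The buffer pointers,
// byte size, memory-type ids, and stream are hypothetical, and the call uses
// the same 10-argument form as CopyBufferHandler() below.
//
//   bool cuda_used = false;
//   Status status = CopyBuffer(
//       "input tensor", TRITONSERVER_MEMORY_CPU, 0 /* src id */,
//       TRITONSERVER_MEMORY_GPU, 0 /* dst id */, byte_size, host_src,
//       gpu_dst, stream, &cuda_used);
//   if (status.IsOk() && cuda_used) {
//     // The copy was issued asynchronously on 'stream'; synchronize (or use
//     // a CUDA event) before the destination buffer is consumed.
//     cudaStreamSynchronize(stream);
//   }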
void
CopyBufferHandler(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, void* response_ptr,
    triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
        completion_queue)
{
  bool cuda_used = false;
  Status status = CopyBuffer(
      msg, src_memory_type, src_memory_type_id, dst_memory_type,
      dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
  completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}
#ifdef TRITON_ENABLE_GPU
Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
  // Query the compute capability from the device
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID " +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  double compute_compability = cuprops.major + (cuprops.minor / 10.0);
  if ((compute_compability > min_compute_capability) ||
      (abs(compute_compability - min_compute_capability) < 0.01)) {
    return Status::Success;
  } else {
    return Status(
        Status::Code::UNSUPPORTED,
        "gpu " + std::to_string(gpu_id) + " has compute capability '" +
            std::to_string(cuprops.major) + "." +
            std::to_string(cuprops.minor) +
            "' which is less than the minimum supported of '" +
            std::to_string(min_compute_capability) + "'");
  }
}
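// Worked example of the check above: a GPU reporting major=7, minor=5 yields
// compute capability 7 + 5 / 10.0 = 7.5, which passes the first test against
// min_compute_capability = 6.0. A device reporting exactly 6.0 is accepted by
// the second test, which treats differences smaller than 0.01 as equality to
// absorb floating-point rounding.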
Status
GetSupportedGPUs(
    std::set<int>* supported_gpus, const double min_compute_capability)
{
  // Make sure set is empty before starting
  supported_gpus->clear();

  int device_cnt;
  cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
  if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
    device_cnt = 0;
  } else if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get number of CUDA devices: " +
            std::string(cudaGetErrorString(cuerr)));
  }

  // populates supported_gpus
  for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
    Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
    if (status.IsOk()) {
      supported_gpus->insert(gpu_id);
    }
  }
  return Status::Success;
}
Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
  // Query the device to check if integrated
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID " +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  // Zero-copy supported only on integrated GPU when it can map host memory
  if (cuprops.integrated && cuprops.canMapHostMemory) {
    *zero_copy_support = true;
  } else {
    *zero_copy_support = false;
  }

  return Status::Success;
}
#endif  // TRITON_ENABLE_GPU

}}  // namespace triton::core
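A minimal sketch of how these helpers might be combined during device discovery, assuming a GPU-enabled build, a hypothetical caller, and a minimum compute capability of 6.0; it is illustrative only, not code from this commit.

#ifdef TRITON_ENABLE_GPU
Status
ExampleDeviceDiscovery()
{
  std::set<int> gpus;
  Status status = GetSupportedGPUs(&gpus, 6.0 /* min compute capability */);
  if (!status.IsOk()) {
    return status;
  }

  // Peer access is best-effort; a failure here only means some device pairs
  // cannot exchange data directly.
  Status peer_status = EnablePeerAccess(6.0);
  if (!peer_status.IsOk()) {
    LOG_WARNING << peer_status.Message();
  }

  for (const int gpu : gpus) {
    bool zero_copy = false;
    status = SupportsIntegratedZeroCopy(gpu, &zero_copy);
    if (!status.IsOk()) {
      return status;
    }
    LOG_INFO << "GPU " << gpu
             << (zero_copy ? " supports" : " does not support")
             << " integrated zero-copy";
  }
  return Status::Success;
}
#endif  // TRITON_ENABLE_GPU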