OpenDAS / Lmdeploy · Commits

Commit b30f3cdb, authored Nov 14, 2023 by xiabo
Commit message: 添加下载的代码 (Add the downloaded code)
Parent: e38ee081

Changes: 157 files. Showing 20 changed files with 7,864 additions and 0 deletions (+7864 / -0).
3rdparty/core-r22.12/include/triton/core/tritonserver.h   +2360 / -0
3rdparty/core-r22.12/src/backend_config.cc                +225 / -0
3rdparty/core-r22.12/src/backend_config.h                 +77 / -0
3rdparty/core-r22.12/src/backend_manager.cc               +383 / -0
3rdparty/core-r22.12/src/backend_manager.h                +174 / -0
3rdparty/core-r22.12/src/backend_memory_manager.cc        +149 / -0
3rdparty/core-r22.12/src/backend_memory_manager.h         +36 / -0
3rdparty/core-r22.12/src/backend_model.cc                 +1301 / -0
3rdparty/core-r22.12/src/backend_model.h                  +133 / -0
3rdparty/core-r22.12/src/backend_model_instance.cc        +966 / -0
3rdparty/core-r22.12/src/backend_model_instance.h         +200 / -0
3rdparty/core-r22.12/src/buffer_attributes.cc             +104 / -0
3rdparty/core-r22.12/src/buffer_attributes.h              +79 / -0
3rdparty/core-r22.12/src/constants.h                      +108 / -0
3rdparty/core-r22.12/src/cuda_memory_manager.cc           +197 / -0
3rdparty/core-r22.12/src/cuda_memory_manager.h            +85 / -0
3rdparty/core-r22.12/src/cuda_utils.cc                    +263 / -0
3rdparty/core-r22.12/src/cuda_utils.h                     +144 / -0
3rdparty/core-r22.12/src/dynamic_batch_scheduler.cc       +698 / -0
3rdparty/core-r22.12/src/dynamic_batch_scheduler.h        +182 / -0
Too many changes to show: to preserve performance only 157 of 157+ files are displayed.
3rdparty/core-r22.12/include/triton/core/tritonserver.h (new file, mode 0 → 100644) @ b30f3cdb
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONBACKEND API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
    uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
  TRITONSERVER_TYPE_INVALID,
  TRITONSERVER_TYPE_BOOL,
  TRITONSERVER_TYPE_UINT8,
  TRITONSERVER_TYPE_UINT16,
  TRITONSERVER_TYPE_UINT32,
  TRITONSERVER_TYPE_UINT64,
  TRITONSERVER_TYPE_INT8,
  TRITONSERVER_TYPE_INT16,
  TRITONSERVER_TYPE_INT32,
  TRITONSERVER_TYPE_INT64,
  TRITONSERVER_TYPE_FP16,
  TRITONSERVER_TYPE_FP32,
  TRITONSERVER_TYPE_FP64,
  TRITONSERVER_TYPE_BYTES,
  TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
    TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType TRITONSERVER_StringToDataType(
    const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has a variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param dtype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t TRITONSERVER_DataTypeByteSize(
    TRITONSERVER_DataType datatype);
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
  TRITONSERVER_MEMORY_CPU,
  TRITONSERVER_MEMORY_CPU_PINNED,
  TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
    TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
  TRITONSERVER_PARAMETER_STRING,
  TRITONSERVER_PARAMETER_INT,
  TRITONSERVER_PARAMETER_BOOL,
  TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
    TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create a parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
    const char* name, const TRITONSERVER_ParameterType type,
    const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
    const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
    TRITONSERVER_Parameter* parameter);
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
  TRITONSERVER_INSTANCEGROUPKIND_AUTO,
  TRITONSERVER_INSTANCEGROUPKIND_CPU,
  TRITONSERVER_INSTANCEGROUPKIND_GPU,
  TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
    TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
  TRITONSERVER_LOG_INFO,
  TRITONSERVER_LOG_WARN,
  TRITONSERVER_LOG_ERROR,
  TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
  TRITONSERVER_LOG_DEFAULT,
  TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
    TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
    TRITONSERVER_LogLevel level, const char* filename, const int line,
    const char* msg);
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates error and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
  TRITONSERVER_ERROR_UNKNOWN,
  TRITONSERVER_ERROR_INTERNAL,
  TRITONSERVER_ERROR_NOT_FOUND,
  TRITONSERVER_ERROR_INVALID_ARG,
  TRITONSERVER_ERROR_UNAVAILABLE,
  TRITONSERVER_ERROR_UNSUPPORTED,
  TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
    TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code TRITONSERVER_ErrorCode(
    TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
    TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
    TRITONSERVER_Error* error);
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
    TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after TRITONSERVER_ResponseAllocatorAllocFn_t
/// function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
    *TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
    void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type preferred
/// by the allocator, taking into account the caller's preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type ID
/// preferred by the allocator, taking into account the caller's preferred
/// type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp,
    const char* tensor_name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
    TRITONSERVER_ResponseAllocator** allocator,
    TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
    TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
    TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
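///
/// A minimal sketch of a CPU-only allocator built on malloc/free (illustrative
/// only, not part of the API; 'CpuAlloc' and 'CpuRelease' are hypothetical
/// names, the preferred memory type is ignored and CPU memory is always
/// returned, and a real allocator should handle allocation failure and GPU or
/// pinned memory as needed):
///
///   static TRITONSERVER_Error*
///   CpuAlloc(
///       TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
///       size_t byte_size, TRITONSERVER_MemoryType memory_type,
///       int64_t memory_type_id, void* userp, void** buffer,
///       void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
///       int64_t* actual_memory_type_id)
///   {
///     *buffer = (byte_size > 0) ? malloc(byte_size) : NULL;
///     *buffer_userp = NULL;
///     *actual_memory_type = TRITONSERVER_MEMORY_CPU;
///     *actual_memory_type_id = 0;
///     return NULL;  /* success */
///   }
///
///   static TRITONSERVER_Error*
///   CpuRelease(
///       TRITONSERVER_ResponseAllocator* allocator, void* buffer,
///       void* buffer_userp, size_t byte_size,
///       TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
///   {
///     free(buffer);
///     return NULL;  /* success */
///   }
///
///   TRITONSERVER_ResponseAllocator* allocator = NULL;
///   TRITONSERVER_ResponseAllocatorNew(
///       &allocator, CpuAlloc, CpuRelease, NULL /* start_fn */);
///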
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as for
/// the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. The function will
/// typically be called before alloc_fn to learn the allocator's preferred
/// memory type and memory type ID in the current situation, so that a
/// different execution decision can be made if necessary.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
    TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from serialized JSON string.
///
/// \param message The message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
    TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
    TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
    TRITONSERVER_Message* message, const char** base, size_t* byte_size);
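///
/// For example, a message can be built from a JSON string and its serialized
/// form read back (an illustrative sketch; the JSON content is a placeholder,
/// <string.h> is assumed for strlen, and error checking is omitted):
///
///   const char* json = "{\"key\":\"value\"}";
///   TRITONSERVER_Message* message = NULL;
///   TRITONSERVER_MessageNewFromSerializedJson(&message, json, strlen(json));
///
///   const char* base = NULL;
///   size_t byte_size = 0;
///   TRITONSERVER_MessageSerializeToJson(message, &base, &byte_size);
///   /* 'base' and 'byte_size' are valid only until the message is deleted */
///   TRITONSERVER_MessageDelete(message);
///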
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
  TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
    TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
    TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
    const char** base, size_t* byte_size);
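///
/// A usage sketch (illustrative only; 'metrics' is assumed to be a
/// TRITONSERVER_Metrics* obtained elsewhere, <stdio.h> is assumed for fwrite,
/// and error checking is omitted):
///
///   const char* base = NULL;
///   size_t byte_size = 0;
///   TRITONSERVER_MetricsFormatted(
///       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
///   fwrite(base, 1, byte_size, stdout);  /* 'byte_size' gives the length */
///   TRITONSERVER_MetricsDelete(metrics);
///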
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
  /// Tracing disabled. No trace activities are reported.
  TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MIN = 1,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MAX = 2,
  /// Record timestamps for the inference request.
  TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
  /// Record input and output tensor values for the inference request.
  TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
    TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
  TRITONSERVER_TRACE_REQUEST_START = 0,
  TRITONSERVER_TRACE_QUEUE_START = 1,
  TRITONSERVER_TRACE_COMPUTE_START = 2,
  TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
  TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
  TRITONSERVER_TRACE_COMPUTE_END = 5,
  TRITONSERVER_TRACE_REQUEST_END = 6,
  TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
  TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
  TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
    TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
    void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, const char* name,
    TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
    const int64_t* shape, uint64_t dim_count,
    TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
    TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
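///
/// A minimal sketch of timestamp tracing (illustrative only; 'TraceActivity'
/// and 'TraceRelease' are hypothetical callback names, <stdio.h> is assumed
/// for printf, and error checking is omitted). The callbacks simply print
/// each reported activity and delete the trace when it is released:
///
///   static void
///   TraceActivity(
///       TRITONSERVER_InferenceTrace* trace,
///       TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
///       void* userp)
///   {
///     uint64_t id = 0;
///     TRITONSERVER_InferenceTraceId(trace, &id);
///     printf(
///         "trace %llu: %s at %llu ns\n", (unsigned long long)id,
///         TRITONSERVER_InferenceTraceActivityString(activity),
///         (unsigned long long)timestamp_ns);
///   }
///
///   static void
///   TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp)
///   {
///     TRITONSERVER_InferenceTraceDelete(trace);  /* callback owns the trace */
///   }
///
///   TRITONSERVER_InferenceTrace* trace = NULL;
///   TRITONSERVER_InferenceTraceNew(
///       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
///       TraceActivity, TraceRelease, NULL /* trace_userp */);
///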
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback function will be called to report
/// activity for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
    TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
    TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
    TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
  TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
  TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
/// function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
    TRITONSERVER_InferenceResponse* response, const uint32_t flags,
    void* userp);
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server the inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
    TRITONSERVER_InferenceRequest** inference_request,
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version);
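///
/// A typical setup sketch (illustrative only; 'server' is assumed to be a
/// live TRITONSERVER_Server*, and the model name "my_model" and request ID
/// "request-0" are placeholders; error checking is omitted):
///
///   TRITONSERVER_InferenceRequest* request = NULL;
///   TRITONSERVER_InferenceRequestNew(
///       &request, server, "my_model",
///       -1 /* choose version per the model's policy */);
///   TRITONSERVER_InferenceRequestSetId(request, "request-0");
///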
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
    TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
    TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
    TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request,
    uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string.
/// Default is empty "", which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is an unsigned
/// integer, then this function will return a failure. The correlation ID
/// is used to indicate two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate two or more inference requests
/// are related to each other. How this relationship is handled by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const TRITONSERVER_DataType datatype, const int64_t* shape,
    uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, the data
/// type and the shape of the input will be deduced from the model
/// configuration. This function must be called at most once, on a request
/// that has no other inputs, to ensure the deduction is accurate.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used as a reference
/// to the raw input in other Tritonserver APIs. It is not associated with the
/// name used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
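///
/// For example, a CPU input tensor can be described and its data attached as
/// follows (an illustrative sketch; 'request' is assumed to exist, and the
/// input name "INPUT0", shape and data values are placeholders; error
/// checking is omitted):
///
///   const int64_t shape[] = {1, 4};
///   float data[4] = {0.f, 1.f, 2.f, 3.f};
///
///   TRITONSERVER_InferenceRequestAddInput(
///       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2 /* dim_count */);
///   TRITONSERVER_InferenceRequestAppendInputData(
///       request, "INPUT0", data, sizeof(data), TRITONSERVER_MEMORY_CPU,
///       0 /* memory_type_id */);
///   /* 'data' must remain valid until the request releases the buffer */
///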
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will be
/// used, so a non-host-policy-specific version of the data must be added
/// using that API.
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy
/// \param inference_request The request object.
/// \param name The name of the input.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
    void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_ResponseAllocator* response_allocator,
    void* response_allocator_userp,
    TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
    void* response_userp);
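// For illustration only: a minimal sketch of wiring both callbacks onto a
// request before submitting it. 'request' and 'allocator' are assumed to be
// previously created objects, and 'MyRequestRelease' / 'MyResponseComplete'
// are hypothetical user functions matching the
// TRITONSERVER_InferenceRequestReleaseFn_t and
// TRITONSERVER_InferenceResponseCompleteFn_t typedefs defined earlier in
// this header.
//
//   TRITONSERVER_Error* err = TRITONSERVER_InferenceRequestSetReleaseCallback(
//       request, MyRequestRelease, nullptr /* request_release_userp */);
//   if (err == nullptr) {
//     err = TRITONSERVER_InferenceRequestSetResponseCallback(
//         request, allocator, nullptr /* response_allocator_userp */,
//         MyResponseComplete, nullptr /* response_userp */);
//   }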
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseDelete(
    TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. Return a
/// TRITONSERVER_Error object on failure, return nullptr on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseError(
    TRITONSERVER_InferenceResponse* inference_response);
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model that produced
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseModel(
    TRITONSERVER_InferenceResponse* inference_response, const char** model_name,
    int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseId(
    TRITONSERVER_InferenceResponse* inference_response,
    const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a void* pointer that must be cast
/// appropriately based on 'type'. For example:
///
/// void* vvalue;
/// TRITONSERVER_ParameterType type;
/// TRITONSERVER_InferenceResponseParameter(
/// response, index, &name, &type, &vvalue);
/// switch (type) {
/// case TRITONSERVER_PARAMETER_BOOL:
/// bool value = *(reinterpret_cast<bool*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_INT:
/// int64_t value = *(reinterpret_cast<int64_t*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_STRING:
/// const char* value = reinterpret_cast<const char*>(vvalue);
/// ...
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutput(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint64_t* dim_count, const void** base, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
    void** userp);
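// For illustration only: a sketch of iterating over the output tensors of a
// completed response. 'response' is assumed to be a
// TRITONSERVER_InferenceResponse delivered to the response callback; error
// returns are ignored for brevity.
//
//   uint32_t output_count = 0;
//   TRITONSERVER_InferenceResponseOutputCount(response, &output_count);
//   for (uint32_t i = 0; i < output_count; ++i) {
//     const char* name;
//     TRITONSERVER_DataType datatype;
//     const int64_t* shape;
//     uint64_t dim_count;
//     const void* base;
//     size_t byte_size;
//     TRITONSERVER_MemoryType memory_type;
//     int64_t memory_type_id;
//     void* userp;
//     TRITONSERVER_InferenceResponseOutput(
//         response, i, &name, &datatype, &shape, &dim_count, &base,
//         &byte_size, &memory_type, &memory_type_id, &userp);
//   }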
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesNew(
    TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesDelete(
    TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If the cudaIpcHandle does not exist for the
/// buffer, nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
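// For illustration only: a sketch of describing an input buffer with buffer
// attributes and attaching it to a request via
// TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes.
// 'request', 'input_base', 'input_byte_size', and the input name "INPUT0"
// are placeholders; CPU memory is assumed and error returns are ignored.
//
//   TRITONSERVER_BufferAttributes* attrs = nullptr;
//   TRITONSERVER_BufferAttributesNew(&attrs);
//   TRITONSERVER_BufferAttributesSetMemoryType(attrs, TRITONSERVER_MEMORY_CPU);
//   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
//   TRITONSERVER_BufferAttributesSetByteSize(attrs, input_byte_size);
//   TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
//       request, "INPUT0", input_base, attrs);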
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
  TRITONSERVER_MODEL_CONTROL_NONE,
  TRITONSERVER_MODEL_CONTROL_POLL,
  TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
  TRITONSERVER_RATE_LIMIT_OFF,
  TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsNew(TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsDelete(TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetServerId(
    TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
    TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in model repository will be
/// loaded on startup. After startup any changes to the model repository will
/// be ignored. Calling TRITONSERVER_ServerPollModelRepository will result in
/// an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in model repository will be
/// loaded on startup. The model repository can be polled periodically using
/// TRITONSERVER_ServerPollModelRepository and the server will load, unload,
/// and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in model repository will
/// not be loaded on startup. The corresponding model control APIs must be
/// called to load / unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect with TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
    TRITONSERVER_ServerOptions* options, const char* model_name);
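// For illustration only: a sketch of building server options with explicit
// model control and a single startup model. The repository path "/models"
// and the model name "my_model" are placeholders; error returns are ignored.
//
//   TRITONSERVER_ServerOptions* options = nullptr;
//   TRITONSERVER_ServerOptionsNew(&options);
//   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
//   TRITONSERVER_ServerOptionsSetModelControlMode(
//       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
//   TRITONSERVER_ServerOptionsSetStartupModel(options, "my_model");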
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiter prioritizes
/// inference executions using the number of times each instance has been
/// given a chance to run. An execution runs only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: The rate limiting is turned off and the
/// inference gets executed whenever an instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for the instance groups
/// in the model config to determine whether a resource is global. If
/// different model configurations declare conflicting resource types, the
/// server will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
    TRITONSERVER_ServerOptions* options, const char* resource_name,
    const size_t resource_count, const int device);
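// For illustration only: a sketch of enabling execution-count based rate
// limiting with a named resource. 'options' is assumed to be a previously
// created server options object; the resource name "R1" and the count are
// placeholders, and error returns are ignored.
//
//   TRITONSERVER_ServerOptionsSetRateLimiterMode(
//       options, TRITONSERVER_RATE_LIMIT_EXEC_COUNT);
//   TRITONSERVER_ServerOptionsAddRateLimiterResource(
//       options, "R1", 4 /* resource_count */, -1 /* every device */);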
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
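// For illustration only: a sketch of sizing the pinned and CUDA memory
// pools. The byte sizes and GPU device 0 are placeholders, 'options' is a
// previously created server options object, and error returns are ignored.
//
//   TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
//       options, 256 * 1024 * 1024 /* 256 MiB */);
//   TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
//       options, 0 /* gpu_device */, 64 * 1024 * 1024 /* 64 MiB */);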
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
    TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
    TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
    TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used by the buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file A string defining the file where the log outputs will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// facilities to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFile(
    TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogInfo(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogWarn(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogError(
    TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
    TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
    TRITONSERVER_ServerOptions* options, int level);
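// For illustration only: a sketch of a typical logging configuration.
// 'options' is a previously created server options object; an empty file
// name directs logs to the console, and error returns are ignored.
//
//   TRITONSERVER_ServerOptionsSetLogFile(options, "" /* console */);
//   TRITONSERVER_ServerOptionsSetLogInfo(options, true);
//   TRITONSERVER_ServerOptionsSetLogWarn(options, true);
//   TRITONSERVER_ServerOptionsSetLogError(options, true);
//   TRITONSERVER_ServerOptionsSetLogVerbose(options, 0 /* off */);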
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetrics(
    TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
    TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
    TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
    TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
    TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
    TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently only TRITONSERVER_INSTANCEGROUPKIND_GPU is supported.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
    TRITONSERVER_ServerOptions* options,
    const TRITONSERVER_InstanceGroupKind kind, const int device_id,
    const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
    TRITONSERVER_ServerOptions* options, const char* backend_name,
    const char* setting, const char* value);
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
    TRITONSERVER_ServerOptions* options, const char* policy_name,
    const char* setting, const char* value);
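// For illustration only: a sketch of passing a backend configuration setting
// and a host policy setting. The policy name "numa0" and the specific
// setting/value strings are placeholders, not settings defined by this
// header; 'options' is a previously created server options object and error
// returns are ignored.
//
//   TRITONSERVER_ServerOptionsSetBackendConfig(
//       options, "tensorflow", "version", "2");
//   TRITONSERVER_ServerOptionsSetHostPolicy(
//       options, "numa0", "cpu-cores", "0-7");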
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
  TRITONSERVER_BATCH_UNKNOWN = 1,
  TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
  TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
  TRITONSERVER_TXN_ONE_TO_ONE = 1,
  TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerNew(
    TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerDelete(TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerStop(TRITONSERVER_Server* server);
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as
/// its value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path,
    const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsLive(TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsReady(TRITONSERVER_Server* server, bool* ready);
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIsReady(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, bool* ready);
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* flags, void** voidp);
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* txn_flags, void** voidp);
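// For illustration only: a sketch of checking whether a model is decoupled
// before sending requests. 'server' is a running inference server and
// "my_model" is a placeholder model name; error returns are ignored.
//
//   uint32_t txn_flags = 0;
//   TRITONSERVER_ServerModelTransactionProperties(
//       server, "my_model", -1 /* latest version */, &txn_flags, nullptr);
//   const bool is_decoupled = (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0;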
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetadata(
    TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelMetadata(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelStatistics(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelConfig(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, const uint32_t config_version,
    TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Return the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIndex(
    TRITONSERVER_Server* server, uint32_t flags,
    TRITONSERVER_Message** model_index);
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. Returned error indicates if model loaded
/// successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. Returned error indicates if model
/// loaded successfully or not.
/// Currently the below parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
    TRITONSERVER_Server* server, const char* model_name,
    const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
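// For illustration only: a sketch of loading a model with an overriding
// configuration, assuming the TRITONSERVER_ParameterNew /
// TRITONSERVER_ParameterDelete functions declared earlier in this header.
// The model name and the JSON string are placeholders; error returns are
// ignored.
//
//   const char* config_json = "{\"max_batch_size\": 8}";
//   TRITONSERVER_Parameter* param = TRITONSERVER_ParameterNew(
//       "config", TRITONSERVER_PARAMETER_STRING, config_json);
//   const TRITONSERVER_Parameter* params[1] = {param};
//   TRITONSERVER_ServerLoadModelWithParameters(
//       server, "my_model", params, 1 /* parameter_count */);
//   TRITONSERVER_ParameterDelete(param);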
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully unloaded
/// and a success code will be returned.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent model that
/// was loaded along with the requested model (for example, the models composing
/// an ensemble). Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded and a success code will be returned.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
    TRITONSERVER_Server* server, const char* model_name);
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetrics(
    TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerInferAsync(
    TRITONSERVER_Server* server,
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceTrace* trace);
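// For illustration only: a high-level sketch of submitting a request once
// its inputs and callbacks have been set up as described above. 'request'
// is assumed to have been created for a specific model with
// TRITONSERVER_InferenceRequestNew, declared earlier in this header.
//
//   TRITONSERVER_Error* err =
//       TRITONSERVER_ServerInferAsync(server, request, nullptr /* trace */);
//   if (err == nullptr) {
//     // Ownership of 'request' has been transferred to Triton; it is
//     // returned through the release callback, and responses arrive
//     // through the response callback.
//   }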
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
  TRITONSERVER_METRIC_KIND_COUNTER,
  TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyNew(
    TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
    const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyDelete(TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricNew(
    TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
    const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricDelete(TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricValue(TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricIncrement(TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The amount to set metric's value to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricSet(TRITONSERVER_Metric* metric, double value);
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_GetMetricKind(
    TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
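// For illustration only: a sketch of registering and updating a custom
// counter metric. The family name, description, and label are placeholders,
// and TRITONSERVER_ParameterNew / TRITONSERVER_ParameterDelete are assumed
// from earlier in this header; error returns are ignored.
//
//   TRITONSERVER_MetricFamily* family = nullptr;
//   TRITONSERVER_MetricFamilyNew(
//       &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
//       "Total number of custom requests seen");
//   TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
//       "source", TRITONSERVER_PARAMETER_STRING, "example");
//   const TRITONSERVER_Parameter* labels[1] = {label};
//   TRITONSERVER_Metric* metric = nullptr;
//   TRITONSERVER_MetricNew(&metric, family, labels, 1 /* label_count */);
//   TRITONSERVER_ParameterDelete(label);
//   TRITONSERVER_MetricIncrement(metric, 1);
//   // Delete metrics before their family.
//   TRITONSERVER_MetricDelete(metric);
//   TRITONSERVER_MetricFamilyDelete(family);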
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/src/backend_config.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {

namespace {

Status
GetTFSpecializedBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* specialized_name)
{
  std::string tf_version_str = "2";
  const auto& itr = config_map.find("tensorflow");
  if (itr != config_map.end()) {
    if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
      if ((tf_version_str != "1") && (tf_version_str != "2")) {
        return Status(
            Status::Code::INVALID_ARG,
            "unexpected TensorFlow library version '" + tf_version_str +
                "', expects 1 or 2.");
      }
    }
  }

  *specialized_name += tf_version_str;

  return Status::Success;
}

}  // namespace

Status
BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val)
{
  for (const auto& pr : config) {
    if (pr.first == key) {
      *val = pr.second;
      return Status::Success;
    }
  }
  return Status(
      Status::Code::INTERNAL,
      std::string("unable to find common backend configuration for '") + key +
          "'");
}

Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
  try {
    *val = std::stod(str);
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as double");
  }
  return Status::Success;
}

Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
  try {
    std::string lowercase_str{str};
    std::transform(
        lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
        [](unsigned char c) { return std::tolower(c); });
    *val = (lowercase_str == "true");
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as bool");
  }
  return Status::Success;
}

Status
BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
  return Status::Success;
}

Status
BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
  *mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
  *mcc = 0;
#endif  // TRITON_ENABLE_GPU
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find common backend configuration");
  }
  std::string min_compute_capability_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "min-compute-capability", &min_compute_capability_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
      min_compute_capability_str, mcc));
  return Status::Success;
}

Status
BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find auto-complete configuration");
  }
  std::string auto_complete_config_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "auto-complete-config", &auto_complete_config_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToBool(
      auto_complete_config_str, acc));
  return Status::Success;
}

Status
BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name)
{
  *specialized_name = backend_name;
  if (backend_name == "tensorflow") {
    RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
  }
  return Status::Success;
}

Status
BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
  *libname = "triton_" + backend_name + ".dll";
#else
  *libname = "libtriton_" + backend_name + ".so";
#endif
  return Status::Success;
}

Status
BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit)
{
  *memory_limit = 1.0;
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  static std::string key_prefix = "model-load-gpu-limit-device-";
  std::string memory_limit_str;
  auto status = BackendConfiguration(
      itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
  // Allow missing key, default to 1.0 (no limit) if the limit is not specified
  if (status.IsOk()) {
    RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
        memory_limit_str, memory_limit));
  }
  return Status::Success;
}

}}  // namespace triton::core
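Editor's note: the helpers above treat the empty-string key of the cmdline config map as the global (backend-agnostic) settings, which is why several of them call `config_map.find(std::string())`. The following stand-alone sketch illustrates that keying convention with local `Demo*` stand-ins for the Triton types; it is not part of the vendored source.

// Illustrative sketch only; Demo* aliases mirror the shapes used above.
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using DemoCmdlineConfig = std::vector<std::pair<std::string, std::string>>;
using DemoCmdlineConfigMap = std::map<std::string, DemoCmdlineConfig>;

int main()
{
  DemoCmdlineConfigMap config_map;
  // The empty-string key carries the global settings that
  // BackendConfigurationGlobalBackendsDirectory() and friends look up.
  config_map[""] = {{"backend-directory", "/opt/tritonserver/backends"},
                    {"auto-complete-config", "true"}};
  // A backend-specific entry, e.g. selecting the TF2 library.
  config_map["tensorflow"] = {{"version", "2"}};

  for (const auto& pr : config_map[""]) {
    std::cout << pr.first << " = " << pr.second << "\n";
  }
  return 0;
}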
3rdparty/core-r22.12/src/backend_config.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {

/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val);

/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
    const std::string& str, double* val);

/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(
    const std::string& str, bool* val);

/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir);

/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);

/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);

/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name);

/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname);

/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit);

}}  // namespace triton::core
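Editor's note: as the declarations above indicate, a backend name goes through two steps before its shared library is opened: specialization (a configured TensorFlow version turns "tensorflow" into "tensorflow1" or "tensorflow2") and platform-specific library naming. A short hedged sketch of that naming convention follows, using local stand-in logic rather than the functions declared here; Linux naming is shown.

// Illustrative stand-in for the specialization + library-name convention.
#include <iostream>
#include <string>

std::string
DemoLibraryName(const std::string& backend, const std::string& tf_version)
{
  std::string specialized = backend;
  if (backend == "tensorflow") {
    specialized += tf_version;  // "tensorflow" -> "tensorflow1" / "tensorflow2"
  }
  return "libtriton_" + specialized + ".so";
}

int main()
{
  std::cout << DemoLibraryName("tensorflow", "2") << "\n";   // libtriton_tensorflow2.so
  std::cout << DemoLibraryName("onnxruntime", "") << "\n";   // libtriton_onnxruntime.so
  return 0;
}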
3rdparty/core-r22.12/src/backend_manager.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

//
// TritonBackend
//
Status
TritonBackend::Create(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  // Create the JSON representation of the backend configuration.
  triton::common::TritonJson::Value backend_config_json(
      triton::common::TritonJson::ValueType::OBJECT);
  if (!backend_cmdline_config.empty()) {
    triton::common::TritonJson::Value cmdline_json(
        backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
    for (const auto& pr : backend_cmdline_config) {
      RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
    }

    RETURN_IF_ERROR(
        backend_config_json.Add("cmdline", std::move(cmdline_json)));
  }

  TritonServerMessage backend_config(backend_config_json);

  auto local_backend = std::shared_ptr<TritonBackend>(
      new TritonBackend(name, dir, libpath, backend_config));

  // Load the library and initialize all the entrypoints
  RETURN_IF_ERROR(local_backend->LoadBackendLibrary());

  // Backend initialization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object. We must set set shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libaries.
  if (local_backend->backend_init_fn_ != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));

    TRITONSERVER_Error* err = local_backend->backend_init_fn_(
        reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  local_backend->UpdateAttributes();

  *backend = std::move(local_backend);
  return Status::Success;
}

Status
TritonBackend::UpdateAttributes()
{
  if (backend_attri_fn_ == nullptr) {
    return Status::Success;
  }

  // Create an Attribute object for the backend to fill, note that it copies
  // some fields from 'attributes_' while the others use default value. This
  // is an ad hoc way to determine whether the attribute is set by the backend
  // and keep / update current value.
  Attribute latest;
  latest.exec_policy_ = attributes_.exec_policy_;
  RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
      reinterpret_cast<TRITONBACKEND_Backend*>(this),
      reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));

  // Update attributes that were set
  attributes_.exec_policy_ = latest.exec_policy_;
  if (!latest.preferred_groups_.empty()) {
    attributes_.preferred_groups_ = latest.preferred_groups_;
  }
  return Status::Success;
}

TritonBackend::TritonBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath, const TritonServerMessage& backend_config)
    : name_(name), dir_(dir), libpath_(libpath),
      backend_config_(backend_config), state_(nullptr)
{
  ClearHandles();
}

TritonBackend::~TritonBackend()
{
  LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";

  // Backend finalization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object.
  if (backend_fini_fn_ != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
        "failed finalizing backend");
  }

  ClearHandles();
}

void
TritonBackend::ClearHandles()
{
  dlhandle_ = nullptr;
  backend_init_fn_ = nullptr;
  backend_fini_fn_ = nullptr;
  backend_attri_fn_ = nullptr;
  model_init_fn_ = nullptr;
  model_fini_fn_ = nullptr;
  inst_init_fn_ = nullptr;
  inst_fini_fn_ = nullptr;
  inst_exec_fn_ = nullptr;
}

Status
TritonBackend::LoadBackendLibrary()
{
  TritonBackendInitFn_t bifn;
  TritonBackendFiniFn_t bffn;
  TritonBackendAttriFn_t bafn;
  TritonModelInitFn_t mifn;
  TritonModelFiniFn_t mffn;
  TritonModelInstanceInitFn_t iifn;
  TritonModelInstanceFiniFn_t iffn;
  TritonModelInstanceExecFn_t iefn;

  {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));

    RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));

    // Backend initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
        reinterpret_cast<void**>(&bifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
        reinterpret_cast<void**>(&bffn)));
    // Backend attribute function, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
        reinterpret_cast<void**>(&bafn)));

    // Model initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
        reinterpret_cast<void**>(&mifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
        reinterpret_cast<void**>(&mffn)));

    // Model instance initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
        reinterpret_cast<void**>(&iifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
        reinterpret_cast<void**>(&iffn)));

    // Model instance execute function, required
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
        reinterpret_cast<void**>(&iefn)));
  }

  backend_init_fn_ = bifn;
  backend_fini_fn_ = bffn;
  backend_attri_fn_ = bafn;
  model_init_fn_ = mifn;
  model_fini_fn_ = mffn;
  inst_init_fn_ = iifn;
  inst_fini_fn_ = iffn;
  inst_exec_fn_ = iefn;

  return Status::Success;
}

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
  *major = TRITONBACKEND_API_VERSION_MAJOR;
  *minor = TRITONBACKEND_API_VERSION_MINOR;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *name = tb->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
    TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *backend_config = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *policy = tb->ExecutionPolicy();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetExecutionPolicy(policy);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tb->Directory().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
  static TritonMemoryManager gMemoryManager;
  *manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *state = tb->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetState(state);
  return nullptr;  // success
}

}  // extern C

//
// TritonBackendManager
//

static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;

Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
  std::lock_guard<std::mutex> lock(mu_);

  // If there is already a manager then we just use it...
  *manager = backend_manager_.lock();
  if (*manager != nullptr) {
    return Status::Success;
  }

  manager->reset(new TritonBackendManager());
  backend_manager_ = *manager;

  return Status::Success;
}

Status
TritonBackendManager::CreateBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  std::lock_guard<std::mutex> lock(mu_);

  const auto& itr = backend_map_.find(libpath);
  if (itr != backend_map_.end()) {
    *backend = itr->second;
    return Status::Success;
  }

  RETURN_IF_ERROR(TritonBackend::Create(
      name, dir, libpath, backend_cmdline_config, backend));
  backend_map_.insert({libpath, *backend});

  return Status::Success;
}

Status
TritonBackendManager::BackendState(
    std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>*
        backend_state)
{
  std::lock_guard<std::mutex> lock(mu_);

  std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
      backend_state_map(
          new std::unordered_map<std::string, std::vector<std::string>>);
  for (const auto& backend_pair : backend_map_) {
    auto& libpath = backend_pair.first;
    auto backend = backend_pair.second;

    const char* backend_config;
    size_t backend_config_size;
    backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
    backend_state_map->insert(
        {backend->Name(), std::vector<std::string>{libpath, backend_config}});
  }

  *backend_state = std::move(backend_state_map);

  return Status::Success;
}

}}  // namespace triton::core
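Editor's note: `TritonBackendManager` above combines a process-wide `std::weak_ptr` singleton with a map keyed on the shared-library path, so repeated `CreateBackend()` calls for the same `.so` reuse one backend object. The stand-alone sketch below reproduces just that caching pattern with `Demo*` types; it does not use the Triton classes and is only meant to illustrate the design.

// Stand-alone sketch of the "weak_ptr singleton + cache by libpath" pattern.
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct DemoBackend {
  explicit DemoBackend(std::string p) : libpath(std::move(p)) {}
  std::string libpath;
};

class DemoManager {
 public:
  static std::shared_ptr<DemoManager> Create()
  {
    static std::weak_ptr<DemoManager> singleton;
    static std::mutex mu;
    std::lock_guard<std::mutex> lock(mu);
    if (auto existing = singleton.lock()) {
      return existing;  // reuse the live manager, as the real Create() does
    }
    auto created = std::shared_ptr<DemoManager>(new DemoManager());
    singleton = created;
    return created;
  }

  std::shared_ptr<DemoBackend> GetOrCreate(const std::string& libpath)
  {
    std::lock_guard<std::mutex> lock(mu_);
    auto itr = map_.find(libpath);
    if (itr != map_.end()) {
      return itr->second;  // cache hit: same shared library, same backend
    }
    auto backend = std::make_shared<DemoBackend>(libpath);
    map_.emplace(libpath, backend);
    return backend;
  }

 private:
  DemoManager() = default;
  std::mutex mu_;
  std::unordered_map<std::string, std::shared_ptr<DemoBackend>> map_;
};

int main()
{
  auto manager = DemoManager::Create();
  auto a = manager->GetOrCreate("libtriton_example.so");
  auto b = manager->GetOrCreate("libtriton_example.so");
  std::cout << std::boolalpha << (a == b) << "\n";  // true: cached
  return 0;
}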
3rdparty/core-r22.12/src/backend_manager.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {

//
// Proxy to a backend shared library.
//
class TritonBackend {
 public:
  struct Attribute {
    Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
    TRITONBACKEND_ExecutionPolicy exec_policy_;
    std::vector<inference::ModelInstanceGroup> preferred_groups_;
  };
  typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
      TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
      const uint32_t request_cnt);

  static Status Create(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);
  ~TritonBackend();

  const std::string& Name() const { return name_; }
  const std::string& Directory() const { return dir_; }
  const TritonServerMessage& BackendConfig() const { return backend_config_; }
  const Attribute& BackendAttributes() const { return attributes_; }

  TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
  {
    return attributes_.exec_policy_;
  }
  void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
  {
    attributes_.exec_policy_ = policy;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
  TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
  TritonModelInstanceInitFn_t ModelInstanceInitFn() const
  {
    return inst_init_fn_;
  }
  TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
  {
    return inst_fini_fn_;
  }
  TritonModelInstanceExecFn_t ModelInstanceExecFn() const
  {
    return inst_exec_fn_;
  }

 private:
  typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
      TRITONBACKEND_Backend* backend,
      TRITONBACKEND_BackendAttribute* backend_attributes);

  TritonBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath, const TritonServerMessage& backend_config);

  void ClearHandles();
  Status LoadBackendLibrary();
  Status UpdateAttributes();

  // The name of the backend.
  const std::string name_;

  // Full path to the directory holding backend shared library and
  // other artifacts.
  const std::string dir_;

  // Full path to the backend shared library.
  const std::string libpath_;

  // Backend configuration as JSON
  TritonServerMessage backend_config_;

  // backend attributes
  Attribute attributes_;

  // dlopen / dlsym handles
  void* dlhandle_;
  TritonBackendInitFn_t backend_init_fn_;
  TritonBackendFiniFn_t backend_fini_fn_;
  TritonBackendAttriFn_t backend_attri_fn_;
  TritonModelInitFn_t model_init_fn_;
  TritonModelFiniFn_t model_fini_fn_;
  TritonModelInstanceInitFn_t inst_init_fn_;
  TritonModelInstanceFiniFn_t inst_fini_fn_;
  TritonModelInstanceExecFn_t inst_exec_fn_;

  // Opaque state associated with the backend.
  void* state_;
};

//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
 public:
  static Status Create(std::shared_ptr<TritonBackendManager>* manager);

  Status CreateBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);

  Status BackendState(
      std::unique_ptr<
          std::unordered_map<std::string, std::vector<std::string>>>*
          backend_state);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
  TritonBackendManager() = default;
  std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};

}}  // namespace triton::core
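Editor's note: the function-pointer typedefs in `TritonBackend` are filled in by `LoadBackendLibrary()`, which resolves the `TRITONBACKEND_*` symbols from the backend's shared library and leaves optional entrypoints as `nullptr` when they are absent. The hedged POSIX-only sketch below shows the underlying dlopen/dlsym presence check in isolation; the library name used here is only an example and may not exist on a given system, and the symbol is never called, only probed.

// Sketch of probing an optional entrypoint via dlopen/dlsym (POSIX only).
#include <dlfcn.h>
#include <iostream>

int main()
{
  void* handle = dlopen("libtriton_example.so", RTLD_LAZY | RTLD_LOCAL);
  if (handle == nullptr) {
    std::cerr << dlerror() << "\n";  // library not found: nothing to probe
    return 1;
  }
  // Optional symbol: a missing entrypoint simply resolves to nullptr.
  void* init = dlsym(handle, "TRITONBACKEND_Initialize");
  std::cout << "TRITONBACKEND_Initialize "
            << (init != nullptr ? "found" : "not provided (optional)") << "\n";
  dlclose(handle);
  return 0;
}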
3rdparty/core-r22.12/src/backend_memory_manager.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
    TRITONBACKEND_MemoryManager* manager, void** buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
    const uint64_t byte_size)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
    {
      auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "GPU memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
    {
      TRITONSERVER_MemoryType mt = memory_type;
      auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "Pinned memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU: {
      *buffer = malloc(byte_size);
      if (*buffer == nullptr) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
      }
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
    TRITONBACKEND_MemoryManager* manager, void* buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
      auto status = CudaMemoryManager::Free(buffer, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
      auto status = PinnedMemoryManager::Free(buffer);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU:
      free(buffer);
      break;
  }

  return nullptr;  // success
}

}  // extern C

}}  // namespace triton::core
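Editor's note: allocation above is dispatched on the requested `TRITONSERVER_MemoryType`; the CPU case is plain `malloc`/`free`, while the GPU and pinned cases forward to the CUDA-backed managers that exist only when `TRITON_ENABLE_GPU` is defined. The minimal stand-alone sketch below mirrors just that dispatch shape with demo types, stubbing out the GPU paths.

// Minimal sketch of "dispatch allocation on memory type" (CPU path only).
#include <cstdint>
#include <cstdlib>
#include <iostream>

enum class DemoMemoryType { kCpu, kCpuPinned, kGpu };

bool DemoAllocate(DemoMemoryType type, uint64_t byte_size, void** buffer)
{
  switch (type) {
    case DemoMemoryType::kCpu:
      *buffer = std::malloc(byte_size);
      return *buffer != nullptr;
    case DemoMemoryType::kCpuPinned:
    case DemoMemoryType::kGpu:
      return false;  // would forward to Pinned/CudaMemoryManager in Triton
  }
  return false;
}

int main()
{
  void* buffer = nullptr;
  if (DemoAllocate(DemoMemoryType::kCpu, 1024, &buffer)) {
    std::cout << "allocated 1024 bytes on CPU\n";
    std::free(buffer);
  }
  return 0;
}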
3rdparty/core-r22.12/src/backend_memory_manager.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {

// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_model.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

Status
TritonModel::Create(
    InferenceServer* server, const std::string& model_path,
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
    const std::string& model_name, const int64_t version,
    inference::ModelConfig model_config, const bool is_config_provided,
    std::unique_ptr<TritonModel>* model)
{
  model->reset();

  // The model configuration must specify a backend. The name of the
  // corresponding shared library must be libtriton_<backend>.so.
  if (model_config.backend().empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "must specify 'backend' for '" + model_config.name() + "'");
  }

  // Localize the content of the model repository corresponding to
  // 'model_name'. This model holds a handle to the localized content
  // so that it persists as long as the model is loaded.
  std::shared_ptr<LocalizedPath> localized_model_dir;
  RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));

  // Localize paths in backend model config
  // [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
  RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
      model_path, &model_config, &localized_model_dir));

  // Get some internal configuration values needed for initialization.
  std::string backend_dir;
  RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
      backend_cmdline_config_map, &backend_dir));

  bool auto_complete_config = false;
  RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
      backend_cmdline_config_map, &auto_complete_config));

  double min_compute_capability = 0;
  RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
      backend_cmdline_config_map, &min_compute_capability));

  std::string specialized_backend_name;
  RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
      backend_cmdline_config_map, model_config.backend(),
      &specialized_backend_name));

  std::string backend_libname;
  RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
      specialized_backend_name, &backend_libname));

  // Get the path to the backend shared library. Search path is
  // version directory, model directory, global backend directory.
  const auto localized_model_path = localized_model_dir->Path();
  const auto version_path =
      JoinPath({localized_model_path, std::to_string(version)});
  const std::string global_path =
      JoinPath({backend_dir, specialized_backend_name});
  const std::vector<std::string> search_paths = {
      version_path, localized_model_path, global_path};

  std::string backend_libdir;
  std::string backend_libpath;
  for (const auto& path : search_paths) {
    const auto full_path = JoinPath({path, backend_libname});
    bool exists = false;
    RETURN_IF_ERROR(FileExists(full_path, &exists));
    if (exists) {
      backend_libdir = path;
      backend_libpath = full_path;
      break;
    }
  }

  if (backend_libpath.empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "unable to find '" + backend_libname + "' for model '" +
            model_config.name() + "', searched: " + version_path + ", " +
            model_path + ", " + global_path);
  }

  // Resolve the global backend configuration with the specific backend
  // configuration
  triton::common::BackendCmdlineConfig config;
  RETURN_IF_ERROR(ResolveBackendConfigs(
      backend_cmdline_config_map, model_config.backend(), config));

  RETURN_IF_ERROR(SetBackendConfigDefaults(config));

  std::shared_ptr<TritonBackend> backend;
  RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
      model_config.backend(), backend_libdir, backend_libpath, config,
      &backend));

  // Normalize backend-dependent config
  {
    const auto& attributes = backend->BackendAttributes();
    // [WIP] formalize config normalization / validation
    RETURN_IF_ERROR(NormalizeInstanceGroup(
        min_compute_capability, attributes.preferred_groups_, &model_config));
    RETURN_IF_ERROR(
        ValidateInstanceGroup(model_config, min_compute_capability));
  }

  // Create and initialize the model.
  std::unique_ptr<TritonModel> local_model(new TritonModel(
      server, localized_model_dir, backend, min_compute_capability, version,
      model_config, auto_complete_config));

  TritonModel* raw_local_model = local_model.get();

  // Model initialization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object. We must set set shared library
  // path to point to the backend directory in case the backend
  // library attempts to load additional shared libaries.
  if (backend->ModelInitFn() != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));

    TRITONSERVER_Error* err = backend->ModelInitFn()(
        reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  // Initialize the model for Triton core usage
  RETURN_IF_ERROR(local_model->Init(is_config_provided));

  bool device_blocking = false;
  if (local_model->backend_->ExecutionPolicy() ==
      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
    if (model_config.has_sequence_batching()) {
      LOG_INFO << "Overriding execution policy to "
                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
               << model_config.name() << "\"";
    } else {
      device_blocking = true;
    }
  }

  // Create and initialize the model instances for this model.
  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
      raw_local_model, backend_cmdline_config_map, host_policy_map,
      model_config, device_blocking));

  RETURN_IF_ERROR(local_model->SetConfiguredScheduler());

  *model = std::move(local_model);
  return Status::Success;
}

Status
TritonModel::ResolveBackendConfigs(
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const std::string& backend_name,
    triton::common::BackendCmdlineConfig& config)
{
  const auto& global_itr = backend_cmdline_config_map.find(std::string());
  const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
  if (specific_itr == backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    for (auto setting : global_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr == backend_cmdline_config_map.end()) {
    for (auto setting : specific_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    triton::common::BackendCmdlineConfig global_backend_config =
        global_itr->second;
    triton::common::BackendCmdlineConfig specific_backend_config =
        specific_itr->second;

    std::sort(global_backend_config.begin(), global_backend_config.end());
    std::sort(specific_backend_config.begin(), specific_backend_config.end());

    size_t global_index = 0;
    size_t specific_index = 0;
    while (global_index < global_backend_config.size() &&
           specific_index < specific_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      if (current_specific_setting.first.compare(
              current_global_setting.first) == 0) {
        // specific setting overrides global setting
        config.push_back(current_specific_setting);
        ++global_index;
        ++specific_index;
      } else if (
          current_specific_setting.first.compare(
              current_global_setting.first) < 0) {
        config.push_back(current_specific_setting);
        ++specific_index;
      } else {
        config.push_back(current_global_setting);
        ++global_index;
      }
    }

    // add the rest of the global configs
    if (global_index < global_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      config.push_back(current_global_setting);
    }

    // add the rest of the specific settings
    if (specific_index < specific_backend_config.size()) {
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      config.push_back(current_specific_setting);
    }
  }  // else empty config

  return Status::Success;
}

const std::unordered_map<std::string, std::string> backend_config_defaults(
    {{"default-max-batch-size", "4"}});

Status
TritonModel::SetBackendConfigDefaults(
    triton::common::BackendCmdlineConfig& config)
{
  auto backend_config_defaults_copy = backend_config_defaults;

  for (auto& setting : config) {
    if (setting.first.compare("default-max-batch-size") == 0) {
      LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
                     << "," << setting.second;
      backend_config_defaults_copy.erase(setting.first);
    }

    if (backend_config_defaults_copy.empty()) {
      break;
    }
  }

  // Anything left should be added to the config
  for (const auto& default_setting : backend_config_defaults_copy) {
    LOG_VERBOSE(1) << "Adding default backend config setting: "
                   << default_setting.first << "," << default_setting.second;
    config.push_back(
        std::make_pair(default_setting.first, default_setting.second));
  }

  return Status::Success;
}

Status
TritonModel::AddInstance(
    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
  if (passive) {
    passive_instances_.emplace_back(std::move(instance));
  } else {
    instances_.emplace_back(std::move(instance));
  }

  return Status::Success;
}

Status
TritonModel::UpdateModelConfig(
    const uint32_t config_version,
    TRITONSERVER_Message* updated_config_message)
{
  const char* buffer;
  size_t byte_size;
  RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
      updated_config_message, &buffer, &byte_size));
  inference::ModelConfig updated_config;
  RETURN_IF_ERROR(
      JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
  auto config = Config();
  config.set_max_batch_size(updated_config.max_batch_size());

  auto inputs_config = config.mutable_input();
  *inputs_config = updated_config.input();
  auto outputs_config = config.mutable_output();
  *outputs_config = updated_config.output();

  if (!config.scheduling_choice_case()) {
    if (updated_config.has_dynamic_batching()) {
      auto dynamic_batching_config = config.mutable_dynamic_batching();
      *dynamic_batching_config = updated_config.dynamic_batching();
    } else if (updated_config.has_sequence_batching()) {
      auto sequence_batching_config = config.mutable_sequence_batching();
      *sequence_batching_config = updated_config.sequence_batching();
    } else if (updated_config.has_ensemble_scheduling()) {
      auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
      *ensemble_scheduling_config = updated_config.ensemble_scheduling();
    }  // else do nothing
  } else if (
      config.scheduling_choice_case() !=
      updated_config.scheduling_choice_case()) {
    return Status(
        triton::common::Error::Code::INTERNAL,
        (std::string("Cannot update scheduling choice from ") +
         std::to_string(config.scheduling_choice_case()) +
         std::string(" to ") +
         std::to_string(config.scheduling_choice_case()) +
         std::string(" when auto-completing."))
            .c_str());
  }  // else do nothing

  // Need to normalize the model configuration for
  // populating missing fields.
  RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));

  RETURN_IF_ERROR(SetModelConfig(config));

  return Status::Success;
}

Status
TritonModel::SetConfiguredScheduler()
{
  std::unique_ptr<Scheduler> scheduler;

  // Need to enforce equal shape batches (i.e. non-ragged batches) if
  // the model 1) allows one or more variable-size input tensors that
  // are not marked as 'allow_ragged_batch' or 2) has one or more
  // shape-tensor inputs. This is not needed if all input shapes are
  // non-variable and if there are no shape tensors... so we don't
  // enable it in that case for efficiency reasons.
  std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
  for (const auto input : config_.input()) {
    if (input.is_shape_tensor()) {
      enforce_equal_shape_tensors.insert({input.name(), true});
    } else if (
        !input.allow_ragged_batch() &&
        (triton::common::GetElementCount(input) == -1)) {
      enforce_equal_shape_tensors.insert({input.name(), false});
    }
  }

  // If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
  // otherwise use the default DynamicBatchScheduler.
  if (config_.has_sequence_batching()) {
    // Sequence batcher
    RETURN_IF_ERROR(SequenceBatchScheduler::Create(
        this, enforce_equal_shape_tensors, &scheduler));
  } else if (config_.has_dynamic_batching()) {
    // Dynamic batcher
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
        config_.max_batch_size(), enforce_equal_shape_tensors,
        config_.dynamic_batching(),
        config_.response_cache().enable() /* response_cache_enable */,
        &scheduler));
  } else {
    // Default scheduler. Use dynamic batch scheduler (with batching
    // disabled) as the default scheduler.
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
        1 /* max_batch_size */,
        std::unordered_map<
            std::string, bool>() /* enforce_equal_shape_tensors */,
        false /* preserve_ordering */,
        config_.response_cache().enable() /* response_cache_enable */,
        std::set<int32_t>() /* preferred_batch_sizes */,
        0 /* max_queue_delay_microseconds */, &scheduler));
  }

  return SetScheduler(std::move(scheduler));
}

Status
TritonModel::Initialize()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->Initialize());
  }

  return Status::Success;
}

Status
TritonModel::WarmUp()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->WarmUp());
  }

  return Status::Success;
}

TritonModel::TritonModel(
    InferenceServer* server,
    const std::shared_ptr<LocalizedPath>& localized_model_dir,
    const std::shared_ptr<TritonBackend>& backend,
    const double min_compute_capability, const int64_t version,
    const inference::ModelConfig& config, const bool auto_complete_config)
    : Model(
          min_compute_capability, localized_model_dir->Path(), version,
          config),
      server_(server), min_compute_capability_(min_compute_capability),
      auto_complete_config_(auto_complete_config),
      localized_model_dir_(localized_model_dir), backend_(backend),
      state_(nullptr)
{
}

TritonModel::~TritonModel()
{
  // Explicitly delete/finalize all model instances before finalizing
  // the model itself.
  instances_.clear();
  passive_instances_.clear();

  // Unregister itself from the rate limiter. Note this should happen
  // after all instances are destructed. Destrucing instances ensures
  // there are no instance threads waiting on rate limiter for
  // receiving their payloads.
  server_->GetRateLimiter()->UnregisterModel(this);

  // Model finalization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object.
  if (backend_->ModelFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
        "failed finalizing model");
  }
}

extern "C" {

//
// TRITONBACKEND_Model
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *name = tm->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *version = tm->Version();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
    TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tm->LocalizedModelPath().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message** model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);

  std::string model_config_json;
  Status status =
      ModelConfigToJson(tm->Config(), config_version, &model_config_json);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *model_config = reinterpret_cast<TRITONSERVER_Message*>(
      new TritonServerMessage(std::move(model_config_json)));

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
    TRITONBACKEND_Model* model, bool* auto_complete_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *auto_complete_config = tm->AutoCompleteConfig();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message* model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  Status status = tm->UpdateModelConfig(config_version, model_config);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
    TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
    TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *state = tm->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  tm->SetState(state);
  return nullptr;  // success
}

///
/// TRITONBACKEND_Request
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *id = tr->Id().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(TRITONBACKEND_Request* request, uint64_t* id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::UINT64) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not an unsigned int")
            .c_str());
  }
  *id = correlation_id.UnsignedIntValue();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *flags = tr->Flags();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
    TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::STRING) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not a string")
            .c_str());
  }
  *id = correlation_id.StringValue().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableInputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** input_name)
{
  *input_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input_name = in->Name().c_str();
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
    TRITONBACKEND_Request* request, const char* name,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  const auto& itr = inputs.find(name);
  if (itr == inputs.end()) {
    *input = nullptr;
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "unknown request input name " + name).c_str());
  }

  InferenceRequest::Input* in = itr->second;
  *input = reinterpret_cast<TRITONBACKEND_Input*>(in);

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
    TRITONBACKEND_Request* request, const uint32_t index,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input = reinterpret_cast<TRITONBACKEND_Input*>(in);
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
    TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableRequestedOutputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** output_name)
{
  *output_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& routputs = tr->ImmutableRequestedOutputs();
  if (index >= routputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(routputs.size()) +
         " requested outputs")
            .c_str());
  }

  // The requested outputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // set. This linear search is the best we can do given the requested
  // outputs being in a set and given the typical small number of
  // requested outputs it should not be a performance issue.
  uint32_t cnt = 0;
  for (const auto& rout : routputs) {
    if (cnt++ == index) {
      *output_name = rout.c_str();
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  auto status =
      tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
TRITONAPI_DECLSPEC
TRITONSERVER_Error
*
TRITONBACKEND_RequestRelease
(
TRITONBACKEND_Request
*
request
,
uint32_t
release_flags
)
{
InferenceRequest
*
tr
=
reinterpret_cast
<
InferenceRequest
*>
(
request
);
std
::
unique_ptr
<
InferenceRequest
>
ur
(
tr
);
InferenceRequest
::
Release
(
std
::
move
(
ur
),
release_flags
);
return
nullptr
;
// success
}
///
/// TRITONBACKEND_State
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
  SequenceState* ts = reinterpret_cast<SequenceState*>(state);
  auto status = ts->Update();
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
    TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  SequenceState* lstate;
  std::vector<int64_t> lshape(shape, shape + dims_count);
  auto& sequence_state = tr->GetSequenceStates();

  if (sequence_state == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("unable to add state '") + name +
         "'. State configuration is missing for model '" + tr->ModelName() +
         "'.")
            .c_str());
  }

  Status status = sequence_state->OutputState(
      name, TritonToDataType(datatype), lshape, &lstate);
  if (!status.IsOk()) {
    *state = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *state = reinterpret_cast<TRITONBACKEND_State*>(lstate);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
    TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  Status status = Status::Success;

  // If the buffer size exactly matches the buffer available, reuse the
  // currently allocated buffer.
  if (to->Data()->TotalByteSize() == buffer_byte_size) {
    const std::shared_ptr<AllocatedMemory>& memory =
        reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
    TRITONSERVER_MemoryType current_memory_type;
    int64_t current_memory_type_id;
    void* lbuffer =
        memory->MutableBuffer(&current_memory_type, &current_memory_type_id);

    // If the requested memory type doesn't match the current buffer, allocate
    // a new buffer with the requested memory type and memory type id.
    if (current_memory_type == *memory_type &&
        current_memory_type_id == *memory_type_id) {
      *buffer = lbuffer;
    } else {
      std::shared_ptr<AllocatedMemory> memory =
          std::make_shared<AllocatedMemory>(
              buffer_byte_size, *memory_type, *memory_type_id);
      *buffer = memory->MutableBuffer(memory_type, memory_type_id);
      to->RemoveAllData();
      status = to->SetData(memory);
    }
  } else {
    std::shared_ptr<AllocatedMemory> memory = std::make_shared<AllocatedMemory>(
        buffer_byte_size, *memory_type, *memory_type_id);
    *buffer = memory->MutableBuffer(memory_type, memory_type_id);
    to->RemoveAllData();
    status = to->SetData(memory);
  }

  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
    TRITONBACKEND_State* state,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  to->Data()->BufferAt(
      0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  return nullptr;  // success
}
//
// TRITONBACKEND_ResponseFactory
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
    TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
  *factory = reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  delete response_factory;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
    TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  Status status = (*response_factory)->SendFlags(send_flags);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Response
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
    TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);

  std::unique_ptr<InferenceResponse> tresp;
  Status status = tr->ResponseFactory()->CreateResponse(&tresp);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
    TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);

  std::unique_ptr<InferenceResponse> tr;
  Status status = (*response_factory)->CreateResponse(&tr);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  delete tr;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
    TRITONBACKEND_Response* response, const char* name, const char* value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
    TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
    TRITONBACKEND_Response* response, const char* name, const bool value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
    TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  std::vector<int64_t> lshape(shape, shape + dims_count);
  InferenceResponse::Output* loutput;
  Status status = tr->AddOutput(
      name, TritonToDataType(datatype), std::move(lshape), &loutput);
  if (!status.IsOk()) {
    *output = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
    TRITONBACKEND_Response* response, const uint32_t send_flags,
    TRITONSERVER_Error* error)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);

  Status status;

  std::unique_ptr<InferenceResponse> utr(tr);
  if (error == nullptr) {
    status = InferenceResponse::Send(std::move(utr), send_flags);
  } else {
    status = InferenceResponse::SendWithStatus(
        std::move(utr), send_flags,
        Status(
            TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
            TRITONSERVER_ErrorMessage(error)));
  }

  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
    TRITONBACKEND_Input* input, const char** name,
    TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (byte_size != nullptr) {
    *byte_size = ti->Data()->TotalByteSize();
  }
  if (buffer_count != nullptr) {
    *buffer_count = ti->DataBufferCount();
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (host_policy_name != nullptr) {
    if (byte_size != nullptr) {
      *byte_size = ti->Data(host_policy_name)->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
    }
  } else {
    if (byte_size != nullptr) {
      *byte_size = ti->Data()->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCount();
    }
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBuffer(
      index, buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBufferAttributes(
      index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_attributes = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);

  Status status =
      (host_policy_name == nullptr)
          ? ti->DataBuffer(
                index, buffer, buffer_byte_size, memory_type, memory_type_id)
          : ti->DataBufferForHostPolicy(
                index, buffer, buffer_byte_size, memory_type, memory_type_id,
                host_policy_name);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
    TRITONBACKEND_Output* output, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  Status status = to->AllocateDataBuffer(
      buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
    TRITONBACKEND_Output* output,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  *buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
      to->GetBufferAttributes());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
    TRITONBACKEND_BackendAttribute* backend_attributes,
    const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
    const uint64_t* device_ids, const uint64_t id_count)
{
  auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
  ba->preferred_groups_.emplace_back();
  auto& pg = ba->preferred_groups_.back();
  switch (kind) {
    case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
      pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_CPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_GPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
      pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
      break;
  }
  pg.set_count(count);
  if (device_ids != nullptr) {
    for (size_t i = 0; i < id_count; ++i) {
      pg.add_gpus(device_ids[i]);
    }
  }
  return nullptr;
}

}  // extern C
}}  // namespace triton::core
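The extern "C" functions above are the request, state, response, and input/output surface that a backend library sees while executing. As a rough illustration of how they fit together (this sketch is not part of the commit; the pass-through behavior, the single-input assumption, the output name "OUTPUT0", and the lack of error checking and non-CPU-memory handling are all simplifications), a minimal TRITONBACKEND_ModelInstanceExecute could copy each request's first input into an output of the same shape:

// Illustrative only: a pass-through execute loop for a hypothetical backend.
// A real backend must check every return value and report per-request
// failures through the response it sends.
#include <cstring>
#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count)
{
  for (uint32_t r = 0; r < request_count; ++r) {
    TRITONBACKEND_Request* request = requests[r];

    // Look at the first input of the request.
    TRITONBACKEND_Input* input;
    TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input);

    const char* name;
    TRITONSERVER_DataType datatype;
    const int64_t* shape;
    uint32_t dims_count;
    uint64_t byte_size;
    uint32_t buffer_count;
    TRITONBACKEND_InputProperties(
        input, &name, &datatype, &shape, &dims_count, &byte_size,
        &buffer_count);

    // Create a response and an output with the same datatype/shape,
    // assuming the model config declares an output named "OUTPUT0".
    TRITONBACKEND_Response* response;
    TRITONBACKEND_ResponseNew(&response, request);
    TRITONBACKEND_Output* output;
    TRITONBACKEND_ResponseOutput(
        response, &output, "OUTPUT0", datatype, shape, dims_count);

    // Ask Triton for an output buffer and copy the input buffers into it.
    // For simplicity this assumes the returned buffer really is CPU memory.
    TRITONSERVER_MemoryType mem_type = TRITONSERVER_MEMORY_CPU;
    int64_t mem_type_id = 0;
    void* out_buffer;
    TRITONBACKEND_OutputBuffer(
        output, &out_buffer, byte_size, &mem_type, &mem_type_id);
    uint64_t offset = 0;
    for (uint32_t b = 0; b < buffer_count; ++b) {
      const void* in_buffer;
      uint64_t in_byte_size;
      TRITONSERVER_MemoryType in_mem_type;
      int64_t in_mem_type_id;
      TRITONBACKEND_InputBuffer(
          input, b, &in_buffer, &in_byte_size, &in_mem_type, &in_mem_type_id);
      memcpy(static_cast<char*>(out_buffer) + offset, in_buffer, in_byte_size);
      offset += in_byte_size;
    }

    // Send the final response, then hand the request back to Triton.
    TRITONBACKEND_ResponseSend(
        response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* error */);
    TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL);
  }
  return nullptr;  // success
}

Note how the sketch mirrors the contracts implemented above: TRITONBACKEND_ResponseSend takes ownership of the response, and TRITONBACKEND_RequestRelease wraps the request back into a unique_ptr and returns it to the core.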
3rdparty/core-r22.12/src/backend_model.h
0 → 100644
View file @ b30f3cdb
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {

class InferenceServer;
class TritonModelInstance;

//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
 public:
  static Status Create(
      InferenceServer* server, const std::string& model_path,
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const std::string& model_name, const int64_t version,
      inference::ModelConfig model_config, const bool is_config_provided,
      std::unique_ptr<TritonModel>* model);
  ~TritonModel();

  const std::string& LocalizedModelPath() const
  {
    return localized_model_dir_->Path();
  }

  InferenceServer* Server() { return server_; }

  bool AutoCompleteConfig() const { return auto_complete_config_; }

  Status UpdateModelConfig(
      const uint32_t config_version,
      TRITONSERVER_Message* updated_config_message);

  const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }

  const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
  {
    return instances_;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  Status AddInstance(
      std::unique_ptr<TritonModelInstance>&& instance, const bool passive);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModel);

  TritonModel(
      InferenceServer* server,
      const std::shared_ptr<LocalizedPath>& localized_model_dir,
      const std::shared_ptr<TritonBackend>& backend,
      const double min_compute_capability, const int64_t version,
      const inference::ModelConfig& config, const bool auto_complete_config);

  // Set the scheduler based on the model configuration. The scheduler
  // can only be set once for a backend.
  Status SetConfiguredScheduler();

  // Merges the global backend configs with the specific
  // backend configs.
  static Status ResolveBackendConfigs(
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const std::string& backend_name,
      triton::common::BackendCmdlineConfig& config);

  // Sets defaults for some backend configurations when none are specified on
  // the command line.
  static Status SetBackendConfigDefaults(
      triton::common::BackendCmdlineConfig& config);

  Status Initialize();
  Status WarmUp();

  // The server object that owns this model. The model holds this as a
  // raw pointer because the lifetime of the server is guaranteed to
  // be longer than the lifetime of a model owned by the server.
  InferenceServer* server_;

  // The minimum supported compute capability on device.
  const double min_compute_capability_;

  // Whether the backend should attempt to auto-complete the model config.
  const bool auto_complete_config_;

  // The localized repo directory holding the model. If localization
  // required creation of a temporary local copy then that copy will
  // persist as long as this object is retained by this model.
  std::shared_ptr<LocalizedPath> localized_model_dir_;

  // Backend used by this model.
  std::shared_ptr<TritonBackend> backend_;

  // The model instances for this model.
  std::vector<std::unique_ptr<TritonModelInstance>> instances_;
  std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;

  // Opaque state associated with this model.
  void* state_;
};

}}  // namespace triton::core
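TritonModel::State()/SetState() above is the storage behind the opaque per-model state that a backend attaches through TRITONBACKEND_ModelSetState (implemented earlier in this commit). As a minimal sketch of how a backend typically uses that hook (not part of the commit; MyModelState is hypothetical, and the getter/finalize entry points are assumed to follow the standard tritonbackend.h signatures):

#include "triton/core/tritonbackend.h"

// Hypothetical per-model bookkeeping owned by the backend, not by Triton core.
struct MyModelState {
  int32_t loaded_version = 0;
};

TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
  // Triton stores this pointer opaquely (TritonModel::state_ above) and
  // returns it unchanged from the corresponding TRITONBACKEND_ModelState call.
  auto* state = new MyModelState();
  return TRITONBACKEND_ModelSetState(model, state);
}

TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
  void* state = nullptr;
  TRITONSERVER_Error* err = TRITONBACKEND_ModelState(model, &state);
  delete reinterpret_cast<MyModelState*>(state);
  return err;
}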
3rdparty/core-r22.12/src/backend_model_instance.cc
0 → 100644
View file @ b30f3cdb
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace
triton
{
namespace
core
{
namespace
{
// Utilities for warmup feature
TRITONSERVER_Error
*
WarmupResponseAlloc
(
TRITONSERVER_ResponseAllocator
*
allocator
,
const
char
*
tensor_name
,
size_t
byte_size
,
TRITONSERVER_MemoryType
preferred_memory_type
,
int64_t
preferred_memory_type_id
,
void
*
userp
,
void
**
buffer
,
void
**
buffer_userp
,
TRITONSERVER_MemoryType
*
actual_memory_type
,
int64_t
*
actual_memory_type_id
)
{
*
buffer
=
malloc
(
byte_size
);
if
(
*
buffer
!=
nullptr
)
{
*
actual_memory_type
=
TRITONSERVER_MEMORY_CPU
;
*
actual_memory_type_id
=
0
;
return
nullptr
;
}
return
TRITONSERVER_ErrorNew
(
TRITONSERVER_ERROR_INTERNAL
,
"failed to allocate output buffer for warmup."
);
}
TRITONSERVER_Error
*
WarmupResponseRelease
(
TRITONSERVER_ResponseAllocator
*
allocator
,
void
*
buffer
,
void
*
buffer_userp
,
size_t
byte_size
,
TRITONSERVER_MemoryType
memory_type
,
int64_t
memory_type_id
)
{
free
(
buffer
);
return
nullptr
;
}
ResponseAllocator
warmup_allocator
=
ResponseAllocator
(
WarmupResponseAlloc
,
WarmupResponseRelease
,
nullptr
/* start_fn */
);
void
WarmupResponseComplete
(
TRITONSERVER_InferenceResponse
*
iresponse
,
const
uint32_t
flags
,
void
*
userp
)
{
auto
res_pair
=
reinterpret_cast
<
std
::
pair
<
std
::
promise
<
void
>
,
std
::
vector
<
std
::
string
>*>*>
(
userp
);
if
(
iresponse
!=
nullptr
)
{
auto
err
=
TRITONSERVER_InferenceResponseError
(
iresponse
);
if
(
err
!=
nullptr
)
{
// The error vector is shared by all requests in the batch for now
static
std
::
mutex
res_mtx
;
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
res_mtx
);
res_pair
->
second
->
emplace_back
(
TRITONSERVER_ErrorMessage
(
err
));
}
TRITONSERVER_ErrorDelete
(
err
);
}
// Just delete the response, warmup doesn't check for correctness
LOG_TRITONSERVER_ERROR
(
TRITONSERVER_InferenceResponseDelete
(
iresponse
),
"deleting warmup response"
);
}
// Last response
if
((
flags
&
TRITONSERVER_RESPONSE_COMPLETE_FINAL
)
!=
0
)
{
res_pair
->
first
.
set_value
();
}
}
void
WarmupRequestComplete
(
TRITONSERVER_InferenceRequest
*
request
,
const
uint32_t
flags
,
void
*
userp
)
{
if
((
flags
&
TRITONSERVER_REQUEST_RELEASE_ALL
)
!=
0
)
{
// Don't need to release request here, it is managed in WarmupData
if
(
userp
!=
nullptr
)
{
auto
warmup_promise
=
reinterpret_cast
<
std
::
promise
<
void
>*>
(
userp
);
warmup_promise
->
set_value
();
}
}
}
}
// namespace
TritonModelInstance
::
TritonModelInstance
(
TritonModel
*
model
,
const
std
::
string
&
name
,
const
size_t
index
,
const
TRITONSERVER_InstanceGroupKind
kind
,
const
int32_t
device_id
,
const
std
::
vector
<
std
::
string
>&
profile_names
,
const
bool
passive
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
,
const
TritonServerMessage
&
host_policy_message
,
const
std
::
vector
<
SecondaryDevice
>&
secondary_devices
)
:
model_
(
model
),
name_
(
name
),
index_
(
index
),
kind_
(
kind
),
device_id_
(
device_id
),
host_policy_
(
host_policy
),
host_policy_message_
(
host_policy_message
),
profile_names_
(
profile_names
),
passive_
(
passive
),
secondary_devices_
(
secondary_devices
),
state_
(
nullptr
)
{
#ifdef TRITON_ENABLE_METRICS
if
(
Metrics
::
Enabled
())
{
// Use an ID in the metric only for GPU instances. Otherwise use
// METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
// metric.
const
int
id
=
(
kind_
==
TRITONSERVER_INSTANCEGROUPKIND_GPU
)
?
device_id_
:
METRIC_REPORTER_ID_CPU
;
MetricModelReporter
::
Create
(
model_
->
Name
(),
model_
->
Version
(),
id
,
model_
->
Config
().
metric_tags
(),
&
reporter_
);
}
#endif // TRITON_ENABLE_METRICS
}
TritonModelInstance
::~
TritonModelInstance
()
{
if
(
triton_backend_thread_
.
get
()
!=
nullptr
)
{
triton_backend_thread_
->
StopBackendThread
();
}
// Model finalization is optional...
if
(
model_
->
Backend
()
->
ModelInstanceFiniFn
()
!=
nullptr
)
{
LOG_TRITONSERVER_ERROR
(
model_
->
Backend
()
->
ModelInstanceFiniFn
()(
reinterpret_cast
<
TRITONBACKEND_ModelInstance
*>
(
this
)),
"failed finalizing model instance"
);
}
}
Status
TritonModelInstance
::
CreateInstances
(
TritonModel
*
model
,
const
triton
::
common
::
BackendCmdlineConfigMap
&
backend_cmdline_config_map
,
const
triton
::
common
::
HostPolicyCmdlineConfigMap
&
host_policy_map
,
const
inference
::
ModelConfig
&
model_config
,
const
bool
device_blocking
)
{
static
triton
::
common
::
HostPolicyCmdlineConfig
empty_host_policy
;
// This structure is used to allocate TritonBackendThread to instances on same
// device for device blocking execution policy.
std
::
map
<
uint32_t
,
std
::
shared_ptr
<
TritonBackendThread
>>
device_to_thread_map
;
for
(
const
auto
&
group
:
model_config
.
instance_group
())
{
std
::
vector
<
std
::
string
>
profile_names
;
for
(
const
auto
&
profile_name
:
group
.
profile
())
{
profile_names
.
push_back
(
profile_name
);
}
std
::
vector
<
SecondaryDevice
>
secondary_devices
;
for
(
const
auto
&
secondary_device
:
group
.
secondary_devices
())
{
secondary_devices
.
emplace_back
(
inference
::
ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name
(
secondary_device
.
kind
()),
secondary_device
.
device_id
());
}
for
(
int32_t
c
=
0
;
c
<
group
.
count
();
++
c
)
{
std
::
string
instance_name
{
group
.
count
()
>
1
?
group
.
name
()
+
"_"
+
std
::
to_string
(
c
)
:
group
.
name
()};
const
bool
passive
=
group
.
passive
();
std
::
vector
<
std
::
tuple
<
std
::
string
,
TRITONSERVER_InstanceGroupKind
,
int32_t
,
const
inference
::
ModelRateLimiter
*>>
instance_setting
;
if
(
group
.
kind
()
==
inference
::
ModelInstanceGroup
::
KIND_CPU
)
{
instance_setting
.
emplace_back
(
group
.
host_policy
().
empty
()
?
"cpu"
:
group
.
host_policy
(),
TRITONSERVER_INSTANCEGROUPKIND_CPU
,
0
/* device_id */
,
&
group
.
rate_limiter
());
}
else
if
(
group
.
kind
()
==
inference
::
ModelInstanceGroup
::
KIND_GPU
)
{
for
(
const
int32_t
device_id
:
group
.
gpus
())
{
instance_setting
.
emplace_back
(
group
.
host_policy
().
empty
()
?
(
"gpu_"
+
std
::
to_string
(
device_id
))
:
group
.
host_policy
(),
TRITONSERVER_INSTANCEGROUPKIND_GPU
,
device_id
,
&
group
.
rate_limiter
());
}
}
else
if
(
group
.
kind
()
==
inference
::
ModelInstanceGroup
::
KIND_MODEL
)
{
instance_setting
.
emplace_back
(
group
.
host_policy
().
empty
()
?
"model"
:
group
.
host_policy
(),
TRITONSERVER_INSTANCEGROUPKIND_MODEL
,
0
/* device_id */
,
&
group
.
rate_limiter
());
}
else
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
std
::
string
(
"instance_group kind "
)
+
ModelInstanceGroup_Kind_Name
(
group
.
kind
())
+
" not supported"
);
}
for
(
const
auto
is
:
instance_setting
)
{
const
auto
&
kind
=
std
::
get
<
1
>
(
is
);
const
auto
&
id
=
std
::
get
<
2
>
(
is
);
const
std
::
string
&
policy_name
=
std
::
get
<
0
>
(
is
);
const
triton
::
common
::
HostPolicyCmdlineConfig
*
host_policy
;
const
auto
policy_it
=
host_policy_map
.
find
(
policy_name
);
if
(
policy_it
!=
host_policy_map
.
end
())
{
host_policy
=
&
policy_it
->
second
;
}
else
{
host_policy
=
&
empty_host_policy
;
}
RETURN_IF_ERROR
(
SetNumaConfigOnThread
(
*
host_policy
));
auto
err
=
CreateInstance
(
model
,
instance_name
,
c
,
kind
,
id
,
profile_names
,
passive
,
policy_name
,
*
host_policy
,
*
(
std
::
get
<
3
>
(
is
)),
device_blocking
,
&
device_to_thread_map
,
secondary_devices
);
RETURN_IF_ERROR
(
ResetNumaMemoryPolicy
());
RETURN_IF_ERROR
(
err
);
// When deploying on GPU, we want to make sure the GPU memory usage
// is within allowed range, otherwise, stop the creation to ensure
// there is sufficient GPU memory for other use.
// We check the usage after loading the instance to better enforcing
// the limit. If we check before loading, we may create instance
// that occupies the rest of available memory which against the purpose
if
(
kind
==
TRITONSERVER_INSTANCEGROUPKIND_GPU
)
{
size_t
free
,
total
;
double
memory_limit
;
RETURN_IF_ERROR
(
GetDeviceMemoryInfo
(
id
,
&
free
,
&
total
));
RETURN_IF_ERROR
(
BackendConfigurationModelLoadGpuFraction
(
backend_cmdline_config_map
,
id
,
&
memory_limit
));
const
size_t
allow
=
total
*
memory_limit
;
const
size_t
used
=
total
-
free
;
if
(
used
>
allow
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
std
::
string
(
"can not create model '"
)
+
instance_name
+
"': memory limit set for "
+
TRITONSERVER_InstanceGroupKindString
(
kind
)
+
" "
+
std
::
to_string
(
id
)
+
" has exceeded, model loading is rejected."
);
}
}
}
}
}
return
Status
::
Success
;
}
Status
TritonModelInstance
::
CreateInstance
(
TritonModel
*
model
,
const
std
::
string
&
name
,
const
size_t
index
,
const
TRITONSERVER_InstanceGroupKind
kind
,
const
int32_t
device_id
,
const
std
::
vector
<
std
::
string
>&
profile_names
,
const
bool
passive
,
const
std
::
string
&
host_policy_name
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
,
const
inference
::
ModelRateLimiter
&
rate_limiter_config
,
const
bool
device_blocking
,
std
::
map
<
uint32_t
,
std
::
shared_ptr
<
TritonBackendThread
>>*
device_to_thread_map
,
const
std
::
vector
<
SecondaryDevice
>&
secondary_devices
)
{
// Create the JSON representation of the backend configuration.
triton
::
common
::
TritonJson
::
Value
host_policy_json
(
triton
::
common
::
TritonJson
::
ValueType
::
OBJECT
);
triton
::
common
::
TritonJson
::
Value
policy_setting_json
(
host_policy_json
,
triton
::
common
::
TritonJson
::
ValueType
::
OBJECT
);
for
(
const
auto
&
pr
:
host_policy
)
{
RETURN_IF_ERROR
(
policy_setting_json
.
AddString
(
pr
.
first
.
c_str
(),
pr
.
second
));
}
RETURN_IF_ERROR
(
host_policy_json
.
Add
(
host_policy_name
.
c_str
(),
std
::
move
(
policy_setting_json
)));
TritonServerMessage
host_policy_message
(
host_policy_json
);
std
::
unique_ptr
<
TritonModelInstance
>
local_instance
(
new
TritonModelInstance
(
model
,
name
,
index
,
kind
,
device_id
,
profile_names
,
passive
,
host_policy
,
host_policy_message
,
secondary_devices
));
TRITONBACKEND_ModelInstance
*
triton_instance
=
reinterpret_cast
<
TRITONBACKEND_ModelInstance
*>
(
local_instance
.
get
());
// Instance initialization is optional... We must set set shared
// library path to point to the backend directory in case the
// backend library attempts to load additional shared libaries.
if
(
model
->
Backend
()
->
ModelInstanceInitFn
()
!=
nullptr
)
{
std
::
unique_ptr
<
SharedLibrary
>
slib
;
RETURN_IF_ERROR
(
SharedLibrary
::
Acquire
(
&
slib
));
RETURN_IF_ERROR
(
slib
->
SetLibraryDirectory
(
model
->
Backend
()
->
Directory
()));
TRITONSERVER_Error
*
err
=
model
->
Backend
()
->
ModelInstanceInitFn
()(
triton_instance
);
RETURN_IF_ERROR
(
slib
->
ResetLibraryDirectory
());
RETURN_IF_TRITONSERVER_ERROR
(
err
);
}
if
(
!
passive
)
{
RETURN_IF_ERROR
(
local_instance
->
GenerateWarmupData
());
RETURN_IF_ERROR
(
model
->
Server
()
->
GetRateLimiter
()
->
RegisterModelInstance
(
local_instance
.
get
(),
rate_limiter_config
));
RETURN_IF_ERROR
(
local_instance
->
SetBackendThread
(
kind
,
device_id
,
device_blocking
,
device_to_thread_map
));
}
RETURN_IF_ERROR
(
model
->
AddInstance
(
std
::
move
(
local_instance
),
passive
));
return
Status
::
Success
;
}
Status
TritonModelInstance
::
SetBackendThread
(
const
TRITONSERVER_InstanceGroupKind
kind
,
const
int32_t
device_id
,
const
bool
device_blocking
,
std
::
map
<
uint32_t
,
std
::
shared_ptr
<
TritonBackendThread
>>*
device_to_thread_map
)
{
if
(
device_blocking
&&
(
kind
==
TRITONSERVER_INSTANCEGROUPKIND_GPU
))
{
auto
thread_it
=
device_to_thread_map
->
find
(
device_id
);
if
(
thread_it
!=
device_to_thread_map
->
end
())
{
LOG_VERBOSE
(
1
)
<<
"Using already started backend thread for "
<<
Name
()
<<
" on device "
<<
device_id
;
triton_backend_thread_
=
thread_it
->
second
;
}
}
if
(
triton_backend_thread_
.
get
()
==
nullptr
)
{
std
::
unique_ptr
<
TritonBackendThread
>
local_backend_thread
;
RETURN_IF_ERROR
(
TritonBackendThread
::
CreateBackendThread
(
Name
(),
this
,
0
/* nice */
,
device_id
,
&
local_backend_thread
));
triton_backend_thread_
=
std
::
move
(
local_backend_thread
);
device_to_thread_map
->
insert
({
device_id
,
triton_backend_thread_
});
}
else
{
triton_backend_thread_
->
AddModelInstance
(
this
);
}
RETURN_IF_ERROR
(
triton_backend_thread_
->
InitAndWarmUpModelInstance
(
this
));
return
Status
::
Success
;
}
Status
TritonModelInstance
::
GenerateWarmupData
()
{
warmup_samples_
.
clear
();
for
(
const
auto
&
warmup_setting
:
model_
->
Config
().
model_warmup
())
{
if
(
warmup_setting
.
batch_size
()
==
0
)
{
LOG_VERBOSE
(
1
)
<<
"Skipping batch 0 warmup sample '"
<<
warmup_setting
.
name
()
<<
"'"
;
continue
;
}
LOG_VERBOSE
(
1
)
<<
"Generating warmup sample data for '"
<<
warmup_setting
.
name
()
<<
"'"
;
// Two passes. First pass to get max byte size for synthetic
// data. Second pass to add original inputs and override inputs
// for control inputs.
int64_t
max_zero_byte_size
=
0
;
int64_t
max_random_byte_size
=
0
;
for
(
const
auto
&
input_meta
:
warmup_setting
.
inputs
())
{
auto
element_count
=
triton
::
common
::
GetElementCount
(
input_meta
.
second
.
dims
());
if
(
element_count
==
-
1
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
"warmup setting expects all variable-size dimensions are specified "
"for input '"
+
input_meta
.
first
+
"'"
);
}
int64_t
batch_byte_size
=
element_count
*
triton
::
common
::
GetDataTypeByteSize
(
input_meta
.
second
.
data_type
());
if
(
batch_byte_size
==
0
)
{
batch_byte_size
=
element_count
*
sizeof
(
int32_t
);
}
switch
(
input_meta
.
second
.
input_data_type_case
())
{
case
inference
::
ModelWarmup_Input
::
InputDataTypeCase
::
kZeroData
:
max_zero_byte_size
=
std
::
max
(
batch_byte_size
,
max_zero_byte_size
);
break
;
case
inference
::
ModelWarmup_Input
::
InputDataTypeCase
::
kRandomData
:
{
// Because Triton expects STRING type to be in special format
// (prepend 4 bytes to specify string length), so using zero data
// for simplicity (4 bytes * element count of zeros).
if
(
input_meta
.
second
.
data_type
()
==
inference
::
DataType
::
TYPE_STRING
)
{
max_zero_byte_size
=
std
::
max
(
batch_byte_size
,
max_zero_byte_size
);
}
else
{
max_random_byte_size
=
std
::
max
(
batch_byte_size
,
max_random_byte_size
);
}
break
;
}
default:
break
;
}
}
warmup_samples_
.
emplace_back
(
warmup_setting
.
name
(),
warmup_setting
.
count
());
auto
&
warmup_data
=
warmup_samples_
.
back
();
// Create buffers for synthetic data
TRITONSERVER_MemoryType
type
;
int64_t
type_id
;
warmup_data
.
zero_data_
.
reset
(
new
AllocatedMemory
(
max_zero_byte_size
,
TRITONSERVER_MEMORY_CPU_PINNED
/* memory_type */
,
0
/* memory_type_id */
));
char
*
zero_buffer
=
warmup_data
.
zero_data_
->
MutableBuffer
(
&
type
,
&
type_id
);
memset
(
zero_buffer
,
0
,
max_zero_byte_size
);
warmup_data
.
random_data_
.
reset
(
new
AllocatedMemory
(
max_random_byte_size
,
TRITONSERVER_MEMORY_CPU_PINNED
/* memory_type */
,
0
/* memory_type_id */
));
char
*
random_buffer
=
warmup_data
.
random_data_
->
MutableBuffer
(
&
type
,
&
type_id
);
for
(
int64_t
offset
=
0
;
offset
<
max_random_byte_size
;
offset
++
)
{
random_buffer
[
offset
]
=
rand
();
}
// Prepare the inference request for the specified sample, not using
// in-process C API because the request doesn't go through the same pipeline
// (i.e. no normalization / scheduler) so we need to prepare the request to
// the state just before calling instance execute function.
for
(
size_t
cnt
=
0
;
cnt
<
warmup_setting
.
batch_size
();
cnt
++
)
{
warmup_data
.
requests_
.
emplace_back
(
new
InferenceRequest
(
model_
,
model_
->
Version
()));
auto
&
lrequest
=
warmup_data
.
requests_
.
back
();
// Second pass to prepare original inputs.
std
::
vector
<
std
::
shared_ptr
<
InferenceRequest
::
Input
>>
input_sps
;
for
(
const
auto
&
input_meta
:
warmup_setting
.
inputs
())
{
auto
batch1_element_count
=
triton
::
common
::
GetElementCount
(
input_meta
.
second
.
dims
());
auto
batch_byte_size
=
batch1_element_count
*
triton
::
common
::
GetDataTypeByteSize
(
input_meta
.
second
.
data_type
());
if
(
batch_byte_size
==
0
)
{
batch_byte_size
=
batch1_element_count
*
sizeof
(
int32_t
);
}
const
char
*
allocated_ptr
;
switch
(
input_meta
.
second
.
input_data_type_case
())
{
case
inference
::
ModelWarmup_Input
::
InputDataTypeCase
::
kZeroData
:
allocated_ptr
=
zero_buffer
;
break
;
case
inference
::
ModelWarmup_Input
::
InputDataTypeCase
::
kRandomData
:
{
if
(
input_meta
.
second
.
data_type
()
==
inference
::
DataType
::
TYPE_STRING
)
{
allocated_ptr
=
zero_buffer
;
}
else
{
allocated_ptr
=
random_buffer
;
}
break
;
}
case
inference
::
ModelWarmup_Input
::
InputDataTypeCase
::
kInputDataFile:
{
// For data provided from file, we can set buffer in first pass
warmup_data
.
provided_data_
.
emplace_back
(
new
std
::
string
());
auto
input_data
=
warmup_data
.
provided_data_
.
back
().
get
();
RETURN_IF_ERROR
(
ReadTextFile
(
JoinPath
(
{
model_
->
LocalizedModelPath
(),
kWarmupDataFolder
,
input_meta
.
second
.
input_data_file
()
}
),
input_data
));
if
(
input_meta
.
second
.
data_type
()
==
inference
::
DataType
::
TYPE_STRING
)
{
batch_byte_size
=
input_data
->
size
();
}
else
if
(((
size_t
)
batch_byte_size
)
>
input_data
->
size
())
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
lrequest
->
LogRequest
()
+
"warmup setting expects "
+
std
::
to_string
(
batch_byte_size
)
+
" bytes, but the data "
"provided from "
+
input_meta
.
second
.
input_data_file
()
+
"only has "
+
std
::
to_string
(
input_data
->
size
())
+
" bytes"
);
}
allocated_ptr
=
input_data
->
data
();
break
;
}
default:
return
Status
(
Status
::
Code
::
INVALID_ARG
,
lrequest
->
LogRequest
()
+
"warmup setting expects input '"
+
input_meta
.
first
+
"' to have input_data_type set"
);
}
const
inference
::
ModelInput
*
input_config
;
bool
is_original_input
=
model_
->
GetInput
(
input_meta
.
first
,
&
input_config
).
IsOk
();
InferenceRequest
::
Input
*
input
=
nullptr
;
std
::
vector
<
int64_t
>
input_meta_shape
;
// Append batch size only if the model supports batching
// and not control inpt.
if
((
model_
->
Config
().
max_batch_size
()
!=
0
)
&&
is_original_input
)
{
input_meta_shape
.
push_back
(
1
);
}
for
(
auto
d
:
input_meta
.
second
.
dims
())
{
input_meta_shape
.
push_back
(
d
);
}
if
(
is_original_input
)
{
RETURN_IF_ERROR
(
lrequest
->
AddOriginalInput
(
input_meta
.
first
,
input_meta
.
second
.
data_type
(),
input_meta_shape
,
&
input
));
}
else
{
input_sps
.
emplace_back
();
RETURN_IF_ERROR
(
lrequest
->
AddOverrideInput
(
input_meta
.
first
,
input_meta
.
second
.
data_type
(),
(
model_
->
Config
().
max_batch_size
()
!=
0
?
1
:
0
),
input_meta_shape
,
&
input_sps
.
back
()));
input
=
input_sps
.
back
().
get
();
}
RETURN_IF_ERROR
(
input
->
AppendData
(
allocated_ptr
,
batch_byte_size
,
TRITONSERVER_MEMORY_CPU
/* memory_type */
,
0
/* memory_type_id */
));
}
RETURN_IF_ERROR
(
lrequest
->
PrepareForInference
());
// Override inputs must be added after PrepareForInference() is called
for
(
const
auto
&
sp
:
input_sps
)
{
RETURN_IF_ERROR
(
lrequest
->
AddOverrideInput
(
sp
));
}
}
}
return
Status
::
Success
;
}
void
TritonModelInstance
::
Schedule
(
std
::
vector
<
std
::
unique_ptr
<
InferenceRequest
>>&&
requests
,
const
std
::
function
<
void
()
>&
OnCompletion
)
{
// Use a thread local vector to avoid needing to malloc each
// time an inference is run.
thread_local
std
::
vector
<
TRITONBACKEND_Request
*>
triton_requests
(
1024
);
triton_requests
.
clear
();
for
(
auto
&
r
:
requests
)
{
// Load the input states for the inference request.
r
->
LoadInputStates
();
triton_requests
.
push_back
(
reinterpret_cast
<
TRITONBACKEND_Request
*>
(
r
.
release
()));
}
Execute
(
triton_requests
);
OnCompletion
();
}
Status
TritonModelInstance
::
Initialize
()
{
RETURN_IF_ERROR
(
SetNumaConfigOnThread
(
HostPolicy
()));
return
Status
::
Success
;
}
Status
TritonModelInstance
::
WarmUp
()
{
// move samples to local variable for scoped cleanup
std
::
vector
<
triton
::
core
::
TritonModelInstance
::
WarmupData
>
lwarmup_samples
;
lwarmup_samples
.
swap
(
warmup_samples_
);
for
(
auto
&
sample
:
lwarmup_samples
)
{
for
(
size_t
iteration
=
1
;
iteration
<=
sample
.
count_
;
++
iteration
)
{
LOG_VERBOSE
(
1
)
<<
"model '"
<<
sample
.
requests_
.
back
()
->
ModelName
()
<<
"' instance "
<<
Name
()
<<
" is running warmup sample '"
<<
sample
.
sample_name_
<<
"' for iteration "
<<
iteration
;
// request/response complete is asynchronous so use promise to wait for
// completion. Also collects error message from the responses in a vector.
std
::
vector
<
std
::
promise
<
void
>>
request_complete
(
sample
.
requests_
.
size
());
std
::
vector
<
std
::
string
>
response_errors
;
std
::
vector
<
std
::
pair
<
std
::
promise
<
void
>
,
std
::
vector
<
std
::
string
>*>>
response_complete
(
sample
.
requests_
.
size
());
std
::
vector
<
TRITONBACKEND_Request
*>
triton_requests
;
for
(
size_t
i
=
0
;
i
<
sample
.
requests_
.
size
();
++
i
)
{
auto
&
request
=
sample
.
requests_
[
i
];
request
->
SetReleaseCallback
(
WarmupRequestComplete
,
&
request_complete
[
i
]);
response_complete
[
i
].
second
=
&
response_errors
;
request
->
SetResponseCallback
(
&
warmup_allocator
,
nullptr
,
WarmupResponseComplete
,
&
response_complete
[
i
]);
// Capture timestamp before run to avoid incorrect accumulation from
// sequential warmup runs
#ifdef TRITON_ENABLE_STATS
request
->
CaptureRequestStartNs
();
#endif // TRITON_ENABLE_STATS
request
->
CaptureQueueStartNs
();
triton_requests
.
push_back
(
reinterpret_cast
<
TRITONBACKEND_Request
*>
(
request
.
get
()));
}
Execute
(
triton_requests
);
// Wait for warmup sample to complete and check error
for
(
size_t
i
=
0
;
i
<
sample
.
requests_
.
size
();
++
i
)
{
request_complete
[
i
].
get_future
().
get
();
response_complete
[
i
].
first
.
get_future
().
get
();
}
if
(
response_errors
.
size
()
!=
0
)
{
std
::
string
err_str
=
"failed to run warmup sample '"
+
sample
.
sample_name_
+
"': "
;
for
(
const
auto
&
error
:
response_errors
)
{
err_str
+=
(
error
+
"; "
);
}
// End warmup as soon as there is failing sample
LOG_VERBOSE
(
1
)
<<
"model '"
<<
sample
.
requests_
.
back
()
->
ModelName
()
<<
"' instance "
<<
Name
()
<<
" failed to run warmup sample '"
<<
sample
.
sample_name_
<<
"'"
;
return
Status
(
Status
::
Code
::
INVALID_ARG
,
err_str
);
}
}
}
return
Status
::
Success
;
}
void
TritonModelInstance
::
Execute
(
std
::
vector
<
TRITONBACKEND_Request
*>&
triton_requests
)
{
TRITONBACKEND_ModelInstance
*
triton_model_instance
=
reinterpret_cast
<
TRITONBACKEND_ModelInstance
*>
(
this
);
TritonBackend
::
TritonModelInstanceExecFn_t
inst_exec_fn
=
model_
->
Backend
()
->
ModelInstanceExecFn
();
// If there is an error then we retain ownership of 'requests'
// and must send error responses.
TRITONSERVER_Error
*
err
=
inst_exec_fn
(
triton_model_instance
,
&
triton_requests
[
0
],
triton_requests
.
size
());
if
(
err
!=
nullptr
)
{
Status
status
=
Status
(
TritonCodeToStatusCode
(
TRITONSERVER_ErrorCode
(
err
)),
TRITONSERVER_ErrorMessage
(
err
));
for
(
TRITONBACKEND_Request
*
tr
:
triton_requests
)
{
std
::
unique_ptr
<
InferenceRequest
>
ur
(
reinterpret_cast
<
InferenceRequest
*>
(
tr
));
InferenceRequest
::
RespondIfError
(
ur
,
status
,
true
/* release_requests */
);
}
TRITONSERVER_ErrorDelete
(
err
);
}
}
Status
TritonModelInstance
::
TritonBackendThread
::
CreateBackendThread
(
const
std
::
string
name
,
TritonModelInstance
*
model_instance
,
const
int
nice
,
const
int32_t
device_id
,
std
::
unique_ptr
<
TritonBackendThread
>*
triton_backend_thread
)
{
TritonBackendThread
*
raw_triton_backend_thread
=
new
TritonBackendThread
(
name
,
model_instance
->
Model
());
std
::
unique_ptr
<
TritonBackendThread
>
runner
(
raw_triton_backend_thread
);
runner
->
AddModelInstance
(
model_instance
);
runner
->
backend_thread_
=
std
::
thread
([
raw_triton_backend_thread
,
nice
,
device_id
]()
{
raw_triton_backend_thread
->
BackendThread
(
nice
,
device_id
);
});
triton_backend_thread
->
reset
(
runner
.
release
());
return
Status
::
Success
;
}
void
TritonModelInstance
::
TritonBackendThread
::
AddModelInstance
(
TritonModelInstance
*
model_instance
)
{
model_instances_
.
push_back
(
model_instance
);
}
Status
TritonModelInstance
::
TritonBackendThread
::
InitAndWarmUpModelInstance
(
TritonModelInstance
*
model_instance
)
{
// Initialize the instance on the backend thread
auto
init_payload
=
model_
->
Server
()
->
GetRateLimiter
()
->
GetPayload
(
Payload
::
Operation
::
INIT
,
model_instance
);
RETURN_IF_ERROR
(
model_
->
Server
()
->
GetRateLimiter
()
->
EnqueuePayload
(
model_
,
init_payload
));
RETURN_IF_ERROR
(
init_payload
->
Wait
());
// Warm-up the instance on the backend thread
auto
warmup_payload
=
model_
->
Server
()
->
GetRateLimiter
()
->
GetPayload
(
Payload
::
Operation
::
WARM_UP
,
model_instance
);
RETURN_IF_ERROR
(
model_
->
Server
()
->
GetRateLimiter
()
->
EnqueuePayload
(
model_
,
warmup_payload
));
RETURN_IF_ERROR
(
warmup_payload
->
Wait
());
return
Status
::
Success
;
}
TritonModelInstance
::
TritonBackendThread
::
TritonBackendThread
(
const
std
::
string
&
name
,
TritonModel
*
model
)
:
name_
(
name
),
model_
(
model
)
{
}
TritonModelInstance
::
TritonBackendThread
::~
TritonBackendThread
()
{
StopBackendThread
();
}
void
TritonModelInstance
::
TritonBackendThread
::
StopBackendThread
()
{
if
(
backend_thread_
.
joinable
())
{
// Signal the backend thread to exit and then wait for it...
auto
exit_payload
=
model_
->
Server
()
->
GetRateLimiter
()
->
GetPayload
(
Payload
::
Operation
::
EXIT
,
model_instances_
.
back
());
model_
->
Server
()
->
GetRateLimiter
()
->
EnqueuePayload
(
model_
,
exit_payload
);
backend_thread_
.
join
();
}
}
void
TritonModelInstance
::
TritonBackendThread
::
BackendThread
(
const
int
nice
,
const
int32_t
device_id
)
{
#ifndef _WIN32
if
(
setpriority
(
PRIO_PROCESS
,
syscall
(
SYS_gettid
),
nice
)
==
0
)
{
LOG_VERBOSE
(
1
)
<<
"Starting backend thread for "
<<
name_
<<
" at nice "
        << nice << " on device " << device_id << "...";
  } else {
    LOG_VERBOSE(1) << "Starting backend thread for " << name_
                   << " at default nice (requested nice " << nice << " failed)"
                   << " on device " << device_id << "...";
  }
#else
  LOG_VERBOSE(1) << "Starting backend thread for " << name_
                 << " at default nice on device " << device_id << "...";
#endif

  bool should_exit = false;
  while (!should_exit) {
    std::shared_ptr<Payload> payload;
    model_->Server()->GetRateLimiter()->DequeuePayload(
        model_instances_, &payload);
    NVTX_RANGE(nvtx_, "BackendThread " + name_);
    payload->Execute(&should_exit);
    model_instances_.push_back(payload->GetInstance());
    // Release the payload to the RateLimiter
    model_->Server()->GetRateLimiter()->PayloadRelease(payload);
  }
  LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
    TRITONBACKEND_ModelInstance* instance, const char** name)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *name = ti->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_InstanceGroupKind* kind)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *kind = ti->Kind();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
    TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *device_id = ti->DeviceId;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *host_policy = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(&ti->HostPolicyMessage()));
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->Profiles().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
    TRITONBACKEND_ModelInstance* instance, const uint32_t index,
    const char** profile_name)
{
  *profile_name = nullptr;
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rprofiles = ti->Profiles();
  if (index >= rprofiles.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " + std::to_string(rprofiles.size()) +
         " profiles")
            .c_str());
  }
  *profile_name = rprofiles[index].c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->SecondaryDevices().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
    TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
    int64_t* id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rsecondarydevices = ti->SecondaryDevices();
  if (index >= rsecondarydevices.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " +
         std::to_string(rsecondarydevices.size()) + " secondary devices")
            .c_str());
  }
  *kind = rsecondarydevices[index].kind_.c_str();
  *id = rsecondarydevices[index].id_;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
    TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *is_passive = ti->IsPassive();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
    TRITONBACKEND_ModelInstance* instance, void** state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *state = ti->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
    TRITONBACKEND_ModelInstance* instance, void* state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->SetState(state);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
    const bool success, const uint64_t exec_start_ns,
    const uint64_t compute_start_ns, const uint64_t compute_end_ns,
    const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  tr->ReportStatistics(
      ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
    TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
    const uint64_t exec_start_ns, const uint64_t compute_start_ns,
    const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
      ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS
  return nullptr;  // success
}

}  // extern "C"

}}  // namespace triton::core
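For orientation, the extern "C" functions above are the core-side implementations of the TRITONBACKEND model-instance query API. A minimal sketch of how a backend's TRITONBACKEND_ModelInstanceInitialize hook might call them is shown below; the logging and the error-propagation style are illustrative assumptions, not part of this file.

// Illustrative sketch only: a backend querying the instance properties
// exposed by the functions above. Each call returns nullptr on success.
#include <iostream>
#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
  const char* name = nullptr;
  TRITONSERVER_InstanceGroupKind kind = TRITONSERVER_INSTANCEGROUPKIND_CPU;
  int32_t device_id = 0;

  TRITONSERVER_Error* err = TRITONBACKEND_ModelInstanceName(instance, &name);
  if (err == nullptr) {
    err = TRITONBACKEND_ModelInstanceKind(instance, &kind);
  }
  if (err == nullptr) {
    err = TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id);
  }
  if (err != nullptr) {
    return err;  // hand the TRITONSERVER_Error back to the core
  }

  std::cout << "instance " << name << " kind " << kind << " device "
            << device_id << std::endl;
  return nullptr;  // success
}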
3rdparty/core-r22.12/src/backend_model_instance.h 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {

class TritonModel;
class InferenceRequest;

//
// Represents a model instance.
//
class TritonModelInstance {
 public:
  static Status CreateInstances(
      TritonModel* model,
      const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const inference::ModelConfig& model_config, const bool device_blocking);
  ~TritonModelInstance();

  const std::string& Name() const { return name_; }
  size_t Index() const { return index_; }
  TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
  int32_t DeviceId() const { return device_id_; }
  const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
  {
    return host_policy_;
  }
  const TritonServerMessage& HostPolicyMessage() const
  {
    return host_policy_message_;
  }
  bool IsPassive() const { return passive_; }
  const std::vector<std::string>& Profiles() const { return profile_names_; }

  struct SecondaryDevice {
    SecondaryDevice(const std::string kind, const int64_t id)
        : kind_(kind), id_(id)
    {
    }
    const std::string kind_;
    const int64_t id_;
  };
  const std::vector<SecondaryDevice>& SecondaryDevices() const
  {
    return secondary_devices_;
  }

  Status Initialize();
  Status WarmUp();
  void Schedule(
      std::vector<std::unique_ptr<InferenceRequest>>&& requests,
      const std::function<void()>& OnCompletion);

  TritonModel* Model() const { return model_; }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  MetricModelReporter* MetricReporter() const { return reporter_.get(); }

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);

  class TritonBackendThread;

  TritonModelInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const TritonServerMessage& host_policy_message,
      const std::vector<SecondaryDevice>& secondary_devices);

  static Status CreateInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const std::string& host_policy_name,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const inference::ModelRateLimiter& rate_limiter_config,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map,
      const std::vector<SecondaryDevice>& secondary_devices);

  Status SetBackendThread(
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map);

  Status GenerateWarmupData();

  void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);

  class TritonBackendThread {
   public:
    static Status CreateBackendThread(
        const std::string name, TritonModelInstance* model, const int nice,
        const int32_t device_id,
        std::unique_ptr<TritonBackendThread>* triton_backend_thread);
    void AddModelInstance(TritonModelInstance* model_instance);
    Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
    void StopBackendThread();
    ~TritonBackendThread();

   private:
    TritonBackendThread(const std::string& name, TritonModel* model);
    void BackendThread(const int nice, const int32_t device_id);

    std::string name_;

    TritonModel* model_;

    std::deque<TritonModelInstance*> model_instances_;

    std::thread backend_thread_;
    std::atomic<bool> backend_thread_exit_;
  };
  std::shared_ptr<TritonBackendThread> triton_backend_thread_;

  struct WarmupData {
    WarmupData(const std::string& sample_name, const size_t count)
        : sample_name_(sample_name), count_(std::max(count, size_t{1}))
    {
    }

    std::string sample_name_;
    size_t count_;
    // Using a batch of requests to satisfy batch size, this provides better
    // alignment on the batch expected by the model, especially for sequence
    // model.
    std::vector<std::unique_ptr<InferenceRequest>> requests_;

    // Placeholder for input data
    std::unique_ptr<AllocatedMemory> zero_data_;
    std::unique_ptr<AllocatedMemory> random_data_;
    std::vector<std::unique_ptr<std::string>> provided_data_;
  };
  std::vector<WarmupData> warmup_samples_;

  // The TritonModel object that owns this instance. The instance
  // holds this as a raw pointer because the lifetime of the model is
  // guaranteed to be longer than the lifetime of an instance owned by the
  // model.
  TritonModel* model_;

  std::string name_;
  size_t index_;

  // For CPU device_id_ is always 0. For GPU device_id_ indicates the
  // GPU device to be used by the instance.
  TRITONSERVER_InstanceGroupKind kind_;
  int32_t device_id_;
  const triton::common::HostPolicyCmdlineConfig host_policy_;
  TritonServerMessage host_policy_message_;
  std::vector<std::string> profile_names_;
  bool passive_;

  std::vector<SecondaryDevice> secondary_devices_;

  // Reporter for metrics, or nullptr if no metrics should be reported
  std::shared_ptr<MetricModelReporter> reporter_;

  // Opaque state associated with this model instance.
  void* state_;
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.cc 0 → 100644
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {

void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
  byte_size_ = byte_size;
}

void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
  memory_type_ = memory_type;
}

void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
  memory_type_id_ = memory_type_id;
}

void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
  char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
  cuda_ipc_handle_.clear();
  std::copy(
      lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
      std::back_inserter(cuda_ipc_handle_));
}

void*
BufferAttributes::CudaIpcHandle()
{
  if (cuda_ipc_handle_.empty()) {
    return nullptr;
  } else {
    return reinterpret_cast<void*>(cuda_ipc_handle_.data());
  }
}

size_t
BufferAttributes::ByteSize() const
{
  return byte_size_;
}

TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
  return memory_type_;
}

int64_t
BufferAttributes::MemoryTypeId() const
{
  return memory_type_id_;
}

BufferAttributes::BufferAttributes(
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, char* cuda_ipc_handle)
    : byte_size_(byte_size), memory_type_(memory_type),
      memory_type_id_(memory_type_id)
{
  // cuda ipc handle size
  cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);

  if (cuda_ipc_handle != nullptr) {
    std::copy(
        cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
        std::back_inserter(cuda_ipc_handle_));
  }
}

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.h 0 → 100644
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
#pragma once
namespace triton { namespace core {

//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
 public:
  BufferAttributes(
      size_t byte_size, TRITONSERVER_MemoryType memory_type,
      int64_t memory_type_id, char cuda_ipc_handle[64]);
  BufferAttributes()
  {
    memory_type_ = TRITONSERVER_MEMORY_CPU;
    memory_type_id_ = 0;
    cuda_ipc_handle_.reserve(64);
  }

  // Set the buffer byte size
  void SetByteSize(const size_t& byte_size);

  // Set the buffer memory_type
  void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);

  // Set the buffer memory type id
  void SetMemoryTypeId(const int64_t& memory_type_id);

  // Set the cuda ipc handle
  void SetCudaIpcHandle(void* cuda_ipc_handle);

  // Get the cuda ipc handle
  void* CudaIpcHandle();

  // Get the byte size
  size_t ByteSize() const;

  // Get the memory type
  TRITONSERVER_MemoryType MemoryType() const;

  // Get the memory type id
  int64_t MemoryTypeId() const;

 private:
  size_t byte_size_;
  TRITONSERVER_MemoryType memory_type_;
  int64_t memory_type_id_;
  std::vector<char> cuda_ipc_handle_;
};

}}  // namespace triton::core
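The setters and getters above are plain value accessors, so typical use is to default-construct the object (CPU memory, id 0) and then fill in whatever is known about a buffer. A small hedged sketch; the 1 MiB size, GPU 0, and helper name are chosen purely for illustration.

// Illustrative sketch only: describing a hypothetical 1 MiB buffer that
// lives on GPU 0 using the accessors declared above.
#include "buffer_attributes.h"

void
DescribeGpuBuffer(triton::core::BufferAttributes* attrs)
{
  attrs->SetByteSize(1 << 20);
  attrs->SetMemoryType(TRITONSERVER_MEMORY_GPU);
  attrs->SetMemoryTypeId(0);
  // CudaIpcHandle() stays nullptr until SetCudaIpcHandle() copies in a
  // 64-byte CUDA IPC handle.
}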
3rdparty/core-r22.12/src/constants.h 0 → 100644
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {

constexpr char kInferHeaderContentLengthHTTPHeader[] =
    "Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";

constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";

constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";

constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";

constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";

constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";

constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";

#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif  // TRITON_ENABLE_ENSEMBLE

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
    "auto_mixed_precision";

constexpr char kModelConfigPbTxt[] = "config.pbtxt";

constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";

constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";

constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;

#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif

#define TIMESPEC_TO_NANOS(TS) \
  ((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
  (TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)

#define DISALLOW_MOVE(TypeName) TypeName(Context&& o) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  DISALLOW_COPY(TypeName)                  \
  DISALLOW_ASSIGN(TypeName)

}}  // namespace triton::core
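The TIMESPEC_TO_NANOS and TIMESPEC_TO_MILLIS macros above expect a struct timespec. A minimal sketch of how a caller on a POSIX system might use them; the helper name is an assumption for the example, not something defined in this header.

// Illustrative sketch only: capture a monotonic timestamp and convert it
// with the macro defined above (tv_sec * NANOS_PER_SECOND + tv_nsec).
#include <stdint.h>
#include <time.h>
#include "constants.h"

uint64_t
NowNanos()
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return TIMESPEC_TO_NANOS(ts);
}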
3rdparty/core-r22.12/src/cuda_memory_manager.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace {

#define RETURN_IF_CNMEM_ERROR(S, MSG)                    \
  do {                                                   \
    auto status__ = (S);                                 \
    if (status__ != CNMEM_STATUS_SUCCESS) {              \
      return Status(                                     \
          Status::Code::INTERNAL,                        \
          (MSG) + ": " + cnmemGetErrorString(status__)); \
    }                                                    \
  } while (false)

std::string
PointerToString(void* ptr)
{
  std::stringstream ss;
  ss << ptr;
  return ss.str();
}

}  // namespace

namespace triton { namespace core {

std::unique_ptr<CudaMemoryManager> CudaMemoryManager::instance_;
std::mutex CudaMemoryManager::instance_mu_;

CudaMemoryManager::~CudaMemoryManager()
{
  if (has_allocation_) {
    auto status = cnmemFinalize();
    if (status != CNMEM_STATUS_SUCCESS) {
      LOG_ERROR << "Failed to finalize CUDA memory manager: [" << status << "] "
                << cnmemGetErrorString(status);
    }
  }
}

void
CudaMemoryManager::Reset()
{
  std::lock_guard<std::mutex> lock(instance_mu_);
  instance_.reset();
}

Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
  // Ensure thread-safe creation of CUDA memory pool
  std::lock_guard<std::mutex> lock(instance_mu_);
  if (instance_ != nullptr) {
    LOG_WARNING << "New CUDA memory pools could not be created since they "
                   "already exists";
    return Status::Success;
  }

  std::set<int> supported_gpus;
  auto status = GetSupportedGPUs(
      &supported_gpus, options.min_supported_compute_capability_);
  if (status.IsOk()) {
    std::vector<cnmemDevice_t> devices;
    for (auto gpu : supported_gpus) {
      const auto it = options.memory_pool_byte_size_.find(gpu);
      if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
        devices.emplace_back();
        auto& device = devices.back();
        memset(&device, 0, sizeof(device));
        device.device = gpu;
        device.size = it->second;

        LOG_INFO << "CUDA memory pool is created on device " << device.device
                 << " with size " << device.size;
      }
    }

    if (!devices.empty()) {
      RETURN_IF_CNMEM_ERROR(
          cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
          std::string("Failed to finalize CUDA memory manager"));
    } else {
      LOG_INFO << "CUDA memory pool disabled";
    }

    // Use to finalize CNMeM properly when out of scope
    instance_.reset(new CudaMemoryManager(!devices.empty()));
  } else {
    return Status(
        status.ErrorCode(),
        "Failed to initialize CUDA memory manager: " + status.Message());
  }

  return Status::Success;
}

Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemMalloc(ptr, size, nullptr);

  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to allocate CUDA memory with byte size ") +
               std::to_string(size) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}

Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemFree(ptr, nullptr);

  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to deallocate CUDA memory at address ") +
               PointerToString(ptr) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}

}}  // namespace triton::core
3rdparty/core-r22.12/src/cuda_memory_manager.h 0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {

// This is a singleton class responsible for maintaining CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via functions provided by this class.
class CudaMemoryManager {
 public:
  // Options to configure CUDA memory manager.
  struct Options {
    Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
        : min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
    {
    }

    // The minimum compute capability of the supported devices.
    double min_supported_compute_capability_;

    // The size of CUDA memory reserved for the specified devices.
    // The memory size will be rounded up to align with
    // the default granularity (512 bytes).
    // No memory will be reserved for devices that is not listed.
    std::map<int, uint64_t> memory_pool_byte_size_;
  };

  ~CudaMemoryManager();

  // Create the memory manager based on 'options' specified.
  // Return Status object indicating success or failure.
  static Status Create(const Options& options);

  // Allocate CUDA memory on GPU 'device_id' with
  // the requested 'size' and return the pointer in 'ptr'.
  // Return Status object indicating success or failure.
  static Status Alloc(void** ptr, uint64_t size, int64_t device_id);

  // Free the memory allocated by the memory manager on 'device_id'.
  // Return Status object indicating success or failure.
  static Status Free(void* ptr, int64_t device_id);

 protected:
  // Provide explicit control on the lifecycle of the CUDA memory manager,
  // for testing only.
  static void Reset();

 private:
  CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}
  bool has_allocation_;
  static std::unique_ptr<CudaMemoryManager> instance_;
  static std::mutex instance_mu_;
};

}}  // namespace triton::core
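Because CudaMemoryManager is a process-wide singleton, callers first Create() it with per-GPU pool sizes and then go through the static Alloc()/Free() entry points. A minimal sketch follows; the 64 MB pool on GPU 0, the compute capability, and the wrapper function name are example values, not anything mandated by this header.

// Illustrative sketch only: create the pool, allocate 1 KB from it on
// GPU 0, and free the allocation again.
#include "cuda_memory_manager.h"

triton::core::Status
UsePool()
{
  namespace tc = triton::core;

  // Reserve a 64 MB pool on GPU 0; devices not listed get no pool.
  tc::CudaMemoryManager::Options options(
      6.0 /* min compute capability */, {{0, 64 * 1024 * 1024}});
  tc::Status status = tc::CudaMemoryManager::Create(options);
  if (!status.IsOk()) {
    return status;
  }

  void* ptr = nullptr;
  status = tc::CudaMemoryManager::Alloc(&ptr, 1024, 0 /* device_id */);
  if (status.IsOk()) {
    status = tc::CudaMemoryManager::Free(ptr, 0 /* device_id */);
  }
  return status;
}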
3rdparty/core-r22.12/src/cuda_utils.cc 0 → 100644
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {

#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
  auto* copy_params = reinterpret_cast<CopyParams*>(args);
  memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
  delete copy_params;
}
#endif  // TRITON_ENABLE_GPU

Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
  *free = 0;
  *total = 0;
#ifdef TRITON_ENABLE_GPU
  // Make sure that correct device is set before creating stream and
  // then restore the device to what was set by the caller.
  int current_device;
  auto cuerr = cudaGetDevice(&current_device);
  bool overridden = false;
  if (cuerr == cudaSuccess) {
    overridden = (current_device != device_id);
    if (overridden) {
      cuerr = cudaSetDevice(device_id);
    }
  }

  if (cuerr == cudaSuccess) {
    cuerr = cudaMemGetInfo(free, total);
  }

  if (overridden) {
    cudaSetDevice(current_device);
  }

  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        (std::string("unable to get memory info for device ") +
         std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}

Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
  // If we can't enable peer access for one device pair, the best we can
  // do is skipping it...
  std::set<int> supported_gpus;
  bool all_enabled = false;
  if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
    all_enabled = true;
    int can_access_peer = false;
    for (const auto& host : supported_gpus) {
      auto cuerr = cudaSetDevice(host);
      if (cuerr == cudaSuccess) {
        for (const auto& peer : supported_gpus) {
          if (host == peer) {
            continue;
          }

          cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
          if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
            cuerr = cudaDeviceEnablePeerAccess(peer, 0);
          }

          all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
        }
      }
    }
  }
  if (!all_enabled) {
    return Status(
        Status::Code::UNSUPPORTED,
        "failed to enable peer access for some device pairs");
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}

Status
CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
  NVTX_RANGE(nvtx_, "CopyBuffer");

  *cuda_used = false;

  // For CUDA memcpy, all host to host copy will be blocked in respect to the
  // host, so use memcpy() directly. In this case, need to be careful on whether
  // the src buffer is valid.
  if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
      (dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
    if (copy_on_stream) {
      auto params = new CopyParams(dst, src, byte_size);
      cudaLaunchHostFunc(
          cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
      *cuda_used = true;
    } else {
      memcpy(dst, src, byte_size);
    }
#else
    memcpy(dst, src, byte_size);
#endif  // TRITON_ENABLE_GPU
  } else {
#ifdef TRITON_ENABLE_GPU
    RETURN_IF_CUDA_ERR(
        cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
        msg + ": failed to perform CUDA copy");

    *cuda_used = true;
#else
    return Status(
        Status::Code::INTERNAL,
        msg + ": try to use CUDA copy while GPU is not supported");
#endif  // TRITON_ENABLE_GPU
  }

  return Status::Success;
}

void
CopyBufferHandler(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, void* response_ptr,
    triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
        completion_queue)
{
  bool cuda_used = false;
  Status status = CopyBuffer(
      msg, src_memory_type, src_memory_type_id, dst_memory_type,
      dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
  completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}

#ifdef TRITON_ENABLE_GPU

Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
  // Query the compute capability from the device
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID" +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  double compute_compability = cuprops.major + (cuprops.minor / 10.0);
  if ((compute_compability > min_compute_capability) ||
      (abs(compute_compability - min_compute_capability) < 0.01)) {
    return Status::Success;
  } else {
    return Status(
        Status::Code::UNSUPPORTED,
        "gpu " + std::to_string(gpu_id) + " has compute capability '" +
            std::to_string(cuprops.major) + "." +
            std::to_string(cuprops.minor) +
            "' which is less than the minimum supported of '" +
            std::to_string(min_compute_capability) + "'");
  }
}

Status
GetSupportedGPUs(
    std::set<int>* supported_gpus, const double min_compute_capability)
{
  // Make sure set is empty before starting
  supported_gpus->clear();

  int device_cnt;
  cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
  if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
    device_cnt = 0;
  } else if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL, "unable to get number of CUDA devices: " +
                                    std::string(cudaGetErrorString(cuerr)));
  }

  // populates supported_gpus
  for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
    Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
    if (status.IsOk()) {
      supported_gpus->insert(gpu_id);
    }
  }
  return Status::Success;
}

Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
  // Query the device to check if integrated
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID" +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  // Zero-copy supported only on integrated GPU when it can map host memory
  if (cuprops.integrated && cuprops.canMapHostMemory) {
    *zero_copy_support = true;
  } else {
    *zero_copy_support = false;
  }

  return Status::Success;
}

#endif

}}  // namespace triton::core
3rdparty/core-r22.12/src/cuda_utils.h 0 → 100644
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <set>
#include "status.h"
#include "triton/common/sync_queue.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace core {

#ifdef TRITON_ENABLE_GPU
#define RETURN_IF_CUDA_ERR(X, MSG)                                           \
  do {                                                                       \
    cudaError_t err__ = (X);                                                 \
    if (err__ != cudaSuccess) {                                              \
      return Status(                                                         \
          Status::Code::INTERNAL, (MSG) + ": " + cudaGetErrorString(err__)); \
    }                                                                        \
  } while (false)
#endif  // TRITON_ENABLE_GPU

#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif  // !TRITON_ENABLE_GPU

/// Get the memory info for the specified device.
/// \param device_id The device ID.
/// \param free Return free memory in bytes.
/// \param total Return total memory in bytes.
/// \return The error status. A non-OK status means failure to get memory info.
Status GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total);

/// Enable peer access for all GPU device pairs
/// \param min_compute_capability The minimum support CUDA compute
/// capability.
/// \return The error status. A non-OK status means not all pairs are enabled
Status EnablePeerAccess(const double min_compute_capability);

/// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location
/// is identified by the memory type and id, and the corresponding copy will be
/// initiated.
/// \param msg The message to be prepended in error message.
/// \param src_memory_type The memory type CPU/GPU of the source.
/// \param src_memory_type_id The device id of the source.
/// \param dst_memory_type The memory type CPU/GPU of the destination.
/// \param dst_memory_type_id The device id of the destination.
/// \param byte_size The size in bytes to me copied from source to destination.
/// \param src The buffer start address of the source.
/// \param dst The buffer start address of the destination.
/// \param cuda_stream The stream to be associated with, and 0 can be
/// passed for default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return The error status. A non-ok status indicates failure to copy the
/// buffer.
Status CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used,
    bool copy_on_stream = false);

#ifdef TRITON_ENABLE_GPU
/// Validates the compute capability of the GPU indexed
/// \param gpu_id The index of the target GPU.
/// \param min_compute_capability The minimum support CUDA compute
/// capability.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status CheckGPUCompatibility(
    const int gpu_id, const double min_compute_capability);

/// Obtains a set of gpu ids that is supported by triton.
/// \param supported_gpus Returns the set of integers which is
/// populated by ids of supported GPUS
/// \param min_compute_capability The minimum support CUDA compute
/// capability.
/// \return The error status. A non-ok status means there were
/// errors encountered while querying GPU devices.
Status GetSupportedGPUs(
    std::set<int>* supported_gpus, const double min_compute_capability);

/// Checks if the GPU specified is an integrated GPU and supports Zero-copy.
/// \param gpu_id The index of the target GPU.
/// \param zero_copy_support If true, Zero-copy is supported by this GPU.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support);
#endif

// Helper around CopyBuffer that updates the completion queue with the returned
// status and cuda_used flag.
void CopyBufferHandler(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, void* response_ptr,
    triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
        completion_queue);

struct CopyParams {
  CopyParams(void* dst, const void* src, const size_t byte_size)
      : dst_(dst), src_(src), byte_size_(byte_size)
  {
  }

  void* dst_;
  const void* src_;
  const size_t byte_size_;
};

}}  // namespace triton::core
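As the comments above describe, CopyBuffer picks between memcpy and cudaMemcpyAsync based on the endpoint memory types. A minimal sketch of a host-to-host copy, where copy_on_stream is left at its default so no CUDA work is issued and cuda_used stays false; the wrapper name and message string are assumptions for the example.

// Illustrative sketch only: copy byte_size bytes between two CPU buffers
// through CopyBuffer declared above.
#include "cuda_utils.h"

triton::core::Status
CopyHostToHost(const void* src, void* dst, size_t byte_size)
{
  bool cuda_used = false;
  return triton::core::CopyBuffer(
      "example copy", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
      TRITONSERVER_MEMORY_CPU, 0 /* dst_memory_type_id */, byte_size, src, dst,
      nullptr /* cuda_stream */, &cuda_used);
}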
3rdparty/core-r22.12/src/dynamic_batch_scheduler.cc 0 → 100644
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dynamic_batch_scheduler.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "constants.h"
#include "server.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
#include "triton/common/nvtx.h"
namespace
triton
{
namespace
core
{
bool
IsStaleState
(
Payload
::
State
payload_state
)
{
return
(
(
payload_state
==
Payload
::
State
::
EXECUTING
)
||
(
payload_state
==
Payload
::
State
::
RELEASED
));
}
DynamicBatchScheduler
::
DynamicBatchScheduler
(
TritonModel
*
model
,
TritonModelInstance
*
model_instance
,
const
bool
dynamic_batching_enabled
,
const
int32_t
max_batch_size
,
const
std
::
unordered_map
<
std
::
string
,
bool
>&
enforce_equal_shape_tensors
,
const
bool
preserve_ordering
,
const
bool
response_cache_enable
,
const
std
::
set
<
int32_t
>&
preferred_batch_sizes
,
const
uint64_t
max_queue_delay_microseconds
,
const
inference
::
ModelQueuePolicy
&
default_queue_policy
,
const
uint32_t
priority_levels
,
const
ModelQueuePolicyMap
&
queue_policy_map
)
:
model_
(
model
),
model_instance_
(
model_instance
),
model_name_
(
model
->
Name
()),
dynamic_batching_enabled_
(
dynamic_batching_enabled
),
queue_
(
default_queue_policy
,
priority_levels
,
queue_policy_map
),
stop_
(
false
),
max_batch_size_
((
size_t
)
std
::
max
(
1
,
max_batch_size
)),
preferred_batch_sizes_
(
preferred_batch_sizes
),
pending_batch_delay_ns_
(
max_queue_delay_microseconds
*
1000
),
pending_batch_size_
(
0
),
queued_batch_size_
(
0
),
next_preferred_batch_size_
(
0
),
enforce_equal_shape_tensors_
(
enforce_equal_shape_tensors
),
has_optional_input_
(
false
),
preserve_ordering_
(
preserve_ordering
)
{
rate_limiter_
=
model_
->
Server
()
->
GetRateLimiter
();
// Both the server and model config should specify
// caching enabled for model to utilize response cache.
response_cache_enabled_
=
(
model_
->
Server
()
->
ResponseCacheEnabled
()
&&
response_cache_enable
);
#ifdef TRITON_ENABLE_METRICS
// Initialize metric reporter for cache statistics if cache enabled
if
(
response_cache_enabled_
)
{
MetricModelReporter
::
Create
(
model_name_
,
model_
->
Version
(),
METRIC_REPORTER_ID_RESPONSE_CACHE
,
model_
->
Config
().
metric_tags
(),
&
reporter_
);
}
#endif // TRITON_ENABLE_METRICS
max_preferred_batch_size_
=
0
;
for
(
const
auto
size
:
preferred_batch_sizes_
)
{
max_preferred_batch_size_
=
std
::
max
(
max_preferred_batch_size_
,
(
size_t
)
size
);
}
for
(
const
auto
&
input
:
model_
->
Config
().
input
())
{
if
(
input
.
optional
())
{
has_optional_input_
=
true
;
break
;
}
}
}
Status
DynamicBatchScheduler
::
Create
(
TritonModel
*
model
,
TritonModelInstance
*
model_instance
,
const
int
nice
,
const
bool
dynamic_batching_enabled
,
const
int32_t
max_batch_size
,
const
std
::
unordered_map
<
std
::
string
,
bool
>&
enforce_equal_shape_tensors
,
const
bool
preserve_ordering
,
const
bool
response_cache_enable
,
const
std
::
set
<
int32_t
>&
preferred_batch_sizes
,
const
uint64_t
max_queue_delay_microseconds
,
std
::
unique_ptr
<
Scheduler
>*
scheduler
)
{
inference
::
ModelDynamicBatching
batcher_config
;
batcher_config
.
set_preserve_ordering
(
preserve_ordering
);
for
(
const
auto
&
bs
:
preferred_batch_sizes
)
{
batcher_config
.
add_preferred_batch_size
(
bs
);
}
batcher_config
.
set_max_queue_delay_microseconds
(
max_queue_delay_microseconds
);
return
Create
(
model
,
model_instance
,
nice
,
dynamic_batching_enabled
,
max_batch_size
,
enforce_equal_shape_tensors
,
batcher_config
,
response_cache_enable
,
scheduler
);
}
Status
DynamicBatchScheduler
::
Create
(
TritonModel
*
model
,
TritonModelInstance
*
model_instance
,
const
int
nice
,
const
bool
dynamic_batching_enabled
,
const
int32_t
max_batch_size
,
const
std
::
unordered_map
<
std
::
string
,
bool
>&
enforce_equal_shape_tensors
,
const
inference
::
ModelDynamicBatching
&
batcher_config
,
const
bool
response_cache_enable
,
std
::
unique_ptr
<
Scheduler
>*
scheduler
)
{
std
::
set
<
int32_t
>
preferred_batch_sizes
;
for
(
const
auto
size
:
batcher_config
.
preferred_batch_size
())
{
preferred_batch_sizes
.
insert
(
size
);
}
DynamicBatchScheduler
*
dyna_sched
=
new
DynamicBatchScheduler
(
model
,
model_instance
,
dynamic_batching_enabled
,
max_batch_size
,
enforce_equal_shape_tensors
,
batcher_config
.
preserve_ordering
(),
response_cache_enable
,
preferred_batch_sizes
,
batcher_config
.
max_queue_delay_microseconds
(),
batcher_config
.
default_queue_policy
(),
batcher_config
.
priority_levels
(),
batcher_config
.
priority_queue_policy
());
std
::
unique_ptr
<
DynamicBatchScheduler
>
sched
(
dyna_sched
);
sched
->
scheduler_thread_exit_
.
store
(
false
);
if
(
dynamic_batching_enabled
)
{
sched
->
NewPayload
();
sched
->
scheduler_thread_
=
std
::
thread
([
dyna_sched
,
nice
]()
{
dyna_sched
->
BatcherThread
(
nice
);
});
}
scheduler
->
reset
(
sched
.
release
());
return
Status
::
Success
;
}
DynamicBatchScheduler
::~
DynamicBatchScheduler
()
{
// Signal the scheduler thread to exit and then wait for it..
scheduler_thread_exit_
.
store
(
true
);
cv_
.
notify_one
();
if
(
scheduler_thread_
.
joinable
())
{
scheduler_thread_
.
join
();
}
}
Status
DynamicBatchScheduler
::
Enqueue
(
std
::
unique_ptr
<
InferenceRequest
>&
request
)
{
if
(
stop_
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
request
->
LogRequest
()
+
"Server is stopping, scheduler for model has stopped accepting new "
"inference requests"
);
}
// If queue start timestamp hasn't been set, queue timer starts at
// the beginning of the queueing and scheduling process. Otherwise,
// dynamic batcher is used as component of another batcher and should not
// overwrite the queue start timestamp.
if
(
request
->
QueueStartNs
()
==
0
)
{
request
->
CaptureQueueStartNs
();
INFER_TRACE_ACTIVITY
(
request
->
Trace
(),
TRITONSERVER_TRACE_QUEUE_START
,
request
->
QueueStartNs
());
#ifdef TRITON_ENABLE_TRACING
request
->
TraceInputTensors
(
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT
,
"DynamicBatchScheduler Enqueue"
);
#endif // TRITON_ENABLE_TRACING
}
// Record time at the beginning of the batcher queueing. In the case of
// oldest sequence batcher, this will overwrite the value that was previously
// set by sequence batcher, which is okay as by this point, the previous
// batcher won't be needing this value and it can be safely reused by
// the dynamic batcher.
request
->
CaptureBatcherStartNs
();
std
::
unique_ptr
<
InferenceResponse
>
cached_response
;
if
(
response_cache_enabled_
)
{
CacheLookUp
(
request
,
cached_response
);
}
if
(
cached_response
!=
nullptr
)
{
// If there was a cache hit then try sending the cached response
// and release the request.
if
(
preserve_ordering_
)
{
// In order to preserve the order, the response send must be
// delegated.
DelegateResponse
(
request
);
}
// Send cached response and release request
InferenceResponse
::
Send
(
std
::
move
(
cached_response
),
TRITONSERVER_RESPONSE_COMPLETE_FINAL
);
InferenceRequest
::
Release
(
std
::
move
(
request
),
TRITONSERVER_REQUEST_RELEASE_ALL
);
return
Status
::
Success
;
}
if
(
!
dynamic_batching_enabled_
)
{
if
(
preserve_ordering_
||
response_cache_enabled_
)
{
DelegateResponse
(
request
);
}
// If not using dynamic batching, directly enqueue the
// request to model for execution
auto
payload
=
model_
->
Server
()
->
GetRateLimiter
()
->
GetPayload
(
Payload
::
Operation
::
INFER_RUN
,
nullptr
/* TritonModelInstance*/
);
payload
->
AddRequest
(
std
::
move
(
request
));
RETURN_IF_ERROR
(
model_
->
Server
()
->
GetRateLimiter
()
->
EnqueuePayload
(
model_
,
payload
));
}
else
{
bool
wake_batcher
=
true
;
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
queued_batch_size_
+=
std
::
max
(
1U
,
request
->
BatchSize
());
// Assuming no error is returned, this call takes ownership of
// 'request' and so we can't use it after this point.
RETURN_IF_ERROR
(
queue_
.
Enqueue
(
request
->
Priority
(),
request
));
// If there are any idle runners and the queued batch size is greater or
// equal to next preferred batch size, then wake batcher up to service
// this request. We do the actual wake outside of the lock to avoid
// having the woken thread immediately block on the lock
wake_batcher
=
model_
->
Server
()
->
GetRateLimiter
()
->
PayloadSlotAvailable
(
model_
);
// We may wake up runner less often if we don't enforce equal shape
// within a batch, otherwise must always wake up runner to check it
if
(
enforce_equal_shape_tensors_
.
empty
())
{
std
::
lock_guard
<
std
::
mutex
>
exec_lock
(
*
(
curr_payload_
->
GetExecMutex
()));
auto
payload_state
=
curr_payload_
->
GetState
();
wake_batcher
&=
(
payload_saturated_
||
IsStaleState
(
payload_state
)
||
(
queued_batch_size_
>=
next_preferred_batch_size_
));
}
}
if
(
wake_batcher
)
{
cv_
.
notify_one
();
}
}
return
Status
::
Success
;
}
void
DynamicBatchScheduler
::
NewPayload
()
{
curr_payload_
=
model_
->
Server
()
->
GetRateLimiter
()
->
GetPayload
(
Payload
::
Operation
::
INFER_RUN
,
model_instance_
);
payload_saturated_
=
false
;
}
void
DynamicBatchScheduler::BatcherThread(const int nice)
{
#ifndef _WIN32
  if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
    LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
                   << " at nice " << nice << "...";
  } else {
    LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
                   << " at default nice (requested nice " << nice
                   << " failed)...";
  }
#else
  LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
                 << " at default nice...";
#endif
  // For debugging/testing, delay start of threads until the queue
  // contains the specified number of entries.
  size_t delay_cnt = 0;
  {
    const char* dstr = getenv("TRITONSERVER_DELAY_SCHEDULER");
    if (dstr != nullptr) {
      delay_cnt = atoi(dstr);
      LOG_VERBOSE(1) << "Delaying batcher thread for " << model_name_
                     << " until " << delay_cnt << " queued requests...";
    }
  }

  auto wait_for_slots = [this]() {
    return model_->Server()->GetRateLimiter()->PayloadSlotAvailable(model_);
  };
  const uint64_t default_wait_microseconds = 500 * 1000;

  while (!scheduler_thread_exit_.load()) {
    NVTX_RANGE(nvtx_, "DynamicBatcher " + model_name_);

    std::shared_ptr<std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>
        rejected_requests;
    uint64_t wait_microseconds = 0;

    // Hold the lock for as short a time as possible.
    {
      std::unique_lock<std::mutex> lock(mu_);
      {
        std::lock_guard<std::mutex> exec_lock(*(curr_payload_->GetExecMutex()));
        auto payload_state = curr_payload_->GetState();
        if (payload_saturated_ || IsStaleState(payload_state)) {
          NewPayload();
          next_preferred_batch_size_ = 0;
        }
      }

      if (delay_cnt > 0) {
        // Debugging/testing... wait until queue contains 'delay_cnt'
        // items...
        wait_microseconds = 10 * 1000;
        if (queue_.Size() >= delay_cnt) {
          delay_cnt = 0;
        }
        LOG_VERBOSE(1) << "Delaying batcher thread " << model_name_
                       << " until " << delay_cnt
                       << " queued requests, current total = " << queue_.Size();
      } else if (queue_.Empty()) {
        wait_microseconds = default_wait_microseconds;
      } else {
        if (payload_saturated_) {
          continue;
        }
        cv_.wait(lock, wait_for_slots);
        {
          std::lock_guard<std::mutex> exec_lock(
              *(curr_payload_->GetExecMutex()));

          auto payload_state = curr_payload_->GetState();
          if (IsStaleState(payload_state)) {
            continue;
          }

          // Use dynamic batching to get request(s) to execute.
          wait_microseconds = GetDynamicBatch();

          // Get requests that are rejected from searching dynamic batch.
          queue_.ReleaseRejectedRequests(&rejected_requests);

          // Extract batch only if there is pending batch
          auto pending_batch_queue_cnt = queue_.PendingBatchCount();
          if ((wait_microseconds == 0) && (pending_batch_queue_cnt != 0)) {
            curr_payload_->ReserveRequests(pending_batch_queue_cnt);
            for (size_t idx = 0; idx < pending_batch_queue_cnt; ++idx) {
              std::unique_ptr<InferenceRequest> request;
              auto status = queue_.Dequeue(&request);
              if (status.IsOk()) {
                if (preserve_ordering_ || response_cache_enabled_) {
                  DelegateResponse(request);
                }
                curr_payload_->AddRequest(std::move(request));
              } else {
                // The queue is empty which conflicts with pending batch
                // count. Send the current batch if any and reset related
                // variables.
                LOG_ERROR << request->LogRequest()
                          << "Failed to retrieve request from scheduler queue: "
                          << status.Message();
                queue_.ResetCursor();
                queued_batch_size_ = 0;
                pending_batch_size_ = 0;
                break;
              }
            }

            if (curr_payload_->GetState() == Payload::State::UNINITIALIZED) {
              curr_payload_->SetState(Payload::State::READY);
            }

            queued_batch_size_ -= pending_batch_size_;
            pending_batch_size_ = 0;
          }
        }
      }

      // If no requests are to be handled, wait for notification or
      // for the specified timeout before checking the queue again.
      if (wait_microseconds > 0) {
        std::chrono::microseconds wait_timeout(wait_microseconds);
        cv_.wait_for(lock, wait_timeout);
      }
    }

    if (curr_payload_->GetState() == Payload::State::READY) {
      auto callback = [this]() { cv_.notify_one(); };
      curr_payload_->SetCallback(callback);
      model_->Server()->GetRateLimiter()->EnqueuePayload(model_, curr_payload_);
    }

    // Finish rejected requests if any
    if (rejected_requests != nullptr) {
      static Status rejected_status =
          Status(Status::Code::UNAVAILABLE, "Request timeout expired");
      for (auto& rejected_queue : *rejected_requests) {
        for (auto& rejected_request : rejected_queue) {
          InferenceRequest::RespondIfError(
              rejected_request, rejected_status, true);
        }
      }
    }
  }  // end runner loop

  LOG_VERBOSE(1) << "Stopping dynamic-batcher thread for " << model_name_
                 << "...";
}
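
// Note (annotation, not part of the original source): the
// TRITONSERVER_DELAY_SCHEDULER environment variable read above is a
// debugging/testing hook. When it is set, the batcher thread polls every 10ms
// and does not form a batch until the queue holds at least that many requests.
// For example, exporting TRITONSERVER_DELAY_SCHEDULER=12 before starting the
// server forces twelve requests to accumulate before the first batch is
// scheduled.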
uint64_t
DynamicBatchScheduler::GetDynamicBatch()
{
  // 'mu_' mutex must be held when this function is called. queue_
  // must not be empty.

  // Examine the new requests. If adding these new requests to the
  // pending batch allows a preferred batch size then execute it
  // immediately. Stop examining requests if the maximum preferred
  // batch size would be exceeded or if the shape of the next request
  // does not match the shape of the pending batch.
  bool send_now = false;
  if (!queue_.IsCursorValid()) {
    queue_.ResetCursor();
    pending_batch_size_ = 0;
  }
  size_t best_preferred_batch_size = 0;
  queued_batch_size_ -= queue_.ApplyPolicyAtCursor();

  // When there is optional input or input shape must be enforced,
  // the inputs in the requests must be examined for forming a batch
  const bool check_input =
      !enforce_equal_shape_tensors_.empty() || has_optional_input_;
  auto payload_batch_size = curr_payload_->BatchSize();
  while (!queue_.CursorEnd()) {
    const auto batch_size = std::max(1U, queue_.RequestAtCursor()->BatchSize());

    // If there is no pending batch, then this request is starting a
    // new batch.
    if ((payload_batch_size + queue_.PendingBatchCount()) == 0) {
      // Get the shape of the new batch that is being started...
      if (check_input) {
        if (!curr_payload_->MutableRequiredEqualInputs()
                 ->Initialize(
                     queue_.RequestAtCursor(), enforce_equal_shape_tensors_,
                     has_optional_input_)
                 .IsOk()) {
          send_now = true;
          break;
        }
      }
    } else {
      // There is a pending batch and adding this request would make
      // the batch size larger than all of the preferred batch sizes,
      // so mark the cursor at this point. Not sending the pending batch so
      // that we can examine the queue delay of requests that fit in a batch.
      if (((payload_batch_size + pending_batch_size_ + batch_size) >
           max_preferred_batch_size_) &&
          (best_preferred_batch_size == 0)) {
        best_preferred_batch_size = pending_batch_size_;
        queue_.MarkCursor();
        payload_saturated_ = true;
      }
      if ((payload_batch_size + pending_batch_size_ + batch_size) >
          max_batch_size_) {
        send_now = true;
        break;
      }

      // There is a pending batch and it has a different shape than
      // this request, so send the pending batch as it is.
      if (check_input &&
          !curr_payload_->MutableRequiredEqualInputs()->HasEqualInputs(
              queue_.RequestAtCursor())) {
        curr_payload_->MarkSaturated();
        send_now = true;
        break;
      }
    }

    pending_batch_size_ += batch_size;
    queue_.AdvanceCursor();
    queued_batch_size_ -= queue_.ApplyPolicyAtCursor();

    if (preferred_batch_sizes_.find(pending_batch_size_ + payload_batch_size) !=
        preferred_batch_sizes_.end()) {
      best_preferred_batch_size = pending_batch_size_;
      queue_.MarkCursor();
    }
  }

  // Obtain the age of the oldest pending request to compare with the maximum
  // batch queuing delay
  uint64_t now_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                        std::chrono::steady_clock::now().time_since_epoch())
                        .count();
  uint64_t delay_ns = now_ns - queue_.OldestEnqueueTime();
  bool delay_is_exceeded =
      (pending_batch_delay_ns_ != 0) && (delay_ns >= pending_batch_delay_ns_);

  // If we found a preferred batch size and the queue delay hasn't been
  // exceeded, then execute that.
  if ((best_preferred_batch_size != 0) && !delay_is_exceeded) {
    if (pending_batch_delay_ns_ == 0) {
      payload_saturated_ = true;
    }
    pending_batch_size_ = best_preferred_batch_size;
    queue_.SetCursorToMark();
    return 0;
  }

  // No request in pending batch happens when all queued requests have expired
  // timeout and the policies are REJECT
  if (queue_.PendingBatchCount() == 0) {
    return 0;
  }

  // If the delay has been exceeded, or if the current batch can't grow
  // any larger then just immediately execute whatever is pending.
  if (send_now || ((payload_batch_size + pending_batch_size_) >=
                   max_preferred_batch_size_)) {
    payload_saturated_ = true;
    return 0;
  }

  if (delay_is_exceeded || (pending_batch_delay_ns_ == 0)) {
    return 0;
  }

  // Set the next preferred batch size given the pending batch size
  auto next_preferred_batch_size_it = preferred_batch_sizes_.upper_bound(
      pending_batch_size_ + payload_batch_size);
  if (next_preferred_batch_size_it != preferred_batch_sizes_.end()) {
    next_preferred_batch_size_ = *next_preferred_batch_size_it;
  } else {
    next_preferred_batch_size_ =
        preferred_batch_sizes_.empty() ? 0 : *preferred_batch_sizes_.begin();
  }
  if (next_preferred_batch_size_ != 0) {
    next_preferred_batch_size_ -= payload_batch_size;
  }

  // By this point, we have not seen the pending batch that should be executed
  // immediately. However, if we have scheduled a payload that can be grown and
  // is not yet at a preferred batch size, we should move the pending batch over
  // to ensure the model instance will pick up the largest available batch even
  // if it is not the preferred batch.
  if (!payload_saturated_ && (payload_batch_size != 0) &&
      (preferred_batch_sizes_.find(payload_batch_size) ==
       preferred_batch_sizes_.end())) {
    return 0;
  }

  uint64_t wait_ns = pending_batch_delay_ns_ - delay_ns;
  // Note that taking request timeout into consideration allows us to reset
  // the pending batch as soon as it is invalidated. But the cost is that in the
  // edge case where the timeouts expire one by one, the thread will be
  // woken frequently.
  if (queue_.ClosestTimeout() != 0) {
    if (now_ns <= queue_.ClosestTimeout()) {
      wait_ns = std::min(queue_.ClosestTimeout() - now_ns, wait_ns);
    } else {
      // A request in the pending batch is timed-out, wait for 1 us to force the
      // thread to reset the pending batch right away.
      wait_ns = 1000;
    }
  }

  // Return non-zero wait microseconds to cause this thread to wait
  // until the queue delay or the closest timeout has expired.
  // Another thread may be woken due to an incoming request to handle the
  // pending batch before this thread wakes and that is ok. But if no other
  // request comes in then this thread will wake and revisit the pending batch
  // (and at that time will then see the delay has been exceeded and will send
  // the batch).
  return wait_ns / 1000;
}
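
// Worked example (annotation, not part of the original source), assuming
// preferred_batch_sizes_ = {4, 8}, max_batch_size_ = 16, an empty payload, and
// pending_batch_delay_ns_ = 100000 (a 100us maximum queue delay):
//   * Three batch-size-1 requests are queued and the oldest is 40us old. No
//     preferred size is reached and the delay is not exceeded, so the function
//     returns a ~60us wait and the pending batch is held.
//   * A fourth request arrives before that wait expires. The pending batch now
//     totals 4, which is in preferred_batch_sizes_, so best_preferred_batch_size
//     is recorded, the cursor is marked, and 0 is returned: the batch executes
//     immediately.
//   * If instead nothing arrives within 100us, delay_is_exceeded becomes true
//     on the next wake-up and the pending batch of 3 is sent as-is.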
void
DynamicBatchScheduler::DelegateResponse(
    std::unique_ptr<InferenceRequest>& request)
{
  std::lock_guard<std::mutex> lock(completion_queue_mtx_);
  completion_queue_.emplace_back();
  auto queue_slot = &completion_queue_.back();
  // Pass raw ptr to lambda for tracking stats from cache and updating
  // metric reporter on cache miss stats after insertion
  InferenceRequest* raw_request_ptr = request.get();
  request->SetResponseDelegator(
      [this, queue_slot, raw_request_ptr](
          std::unique_ptr<InferenceResponse>&& response,
          const uint32_t flags) {
        if (response_cache_enabled_ && raw_request_ptr->CacheKeyIsSet()) {
          // Cache insertion happens here because we need the backend to have
          // computed the inference response first in the case of cache miss
          auto cache = model_->Server()->GetResponseCache();
          auto status = cache->Insert(*response, raw_request_ptr);
          bool cache_miss =
              (status.StatusCode() != Status::Code::ALREADY_EXISTS);
          if (cache_miss) {
#ifdef TRITON_ENABLE_STATS
            // Update cache miss statistics even on failure to insert
            // as we still spend time on lookup and attempting to insert
            raw_request_ptr->ReportStatisticsCacheMiss(reporter_.get());
#endif  // TRITON_ENABLE_STATS
            if (!status.IsOk()) {
              LOG_ERROR << raw_request_ptr->LogRequest()
                        << "Failed to insert request_hash ["
                        << raw_request_ptr->CacheKey()
                        << "] into response cache: " << status.Message();
            }
          }
          // Otherwise do nothing; we update cache hit statistics on Lookup
        }

        if (preserve_ordering_) {
          {
            std::lock_guard<std::mutex> lock(completion_queue_mtx_);
            queue_slot->emplace_back(std::move(response), flags);
          }
          FinalizeResponses();
        } else {
          InferenceResponse::Send(std::move(response), flags);
        }
      });
}
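
// Note (annotation, not part of the original source): SetResponseDelegator()
// above intercepts every response the backend produces for this request. The
// delegator (a) inserts the response into the response cache when caching is
// enabled and the request carries a cache key, and (b) when preserve_ordering_
// is set, parks the response in this request's completion_queue_ slot so that
// FinalizeResponses() can release responses strictly in request order;
// otherwise the response is sent directly.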
void
DynamicBatchScheduler::CacheLookUp(
    std::unique_ptr<InferenceRequest>& request,
    std::unique_ptr<InferenceResponse>& cached_response)
{
  auto cache = model_->Server()->GetResponseCache();
  // Lookup request in cache
  std::unique_ptr<InferenceResponse> local_response;
  request->ResponseFactory()->CreateResponse(&local_response);
  auto status = cache->Lookup(local_response.get(), request.get());
  if (status.IsOk() && (local_response != nullptr)) {
    cached_response = std::move(local_response);
#ifdef TRITON_ENABLE_STATS
    // Update model metrics/stats on cache hits
    // Backends will update metrics as normal on cache misses
    request->ReportStatisticsCacheHit(reporter_.get());
#endif  // TRITON_ENABLE_STATS
  }
}
void
DynamicBatchScheduler::FinalizeResponses()
{
  // Need exclusive access of the function to ensure responses are sent
  // in order
  std::lock_guard<std::mutex> lock(finalize_mtx_);
  // Finalize the completed payloads in-order as far as possible
  std::deque<std::pair<std::unique_ptr<InferenceResponse>, const uint32_t>>
      responses;
  {
    std::lock_guard<std::mutex> queue_lock(completion_queue_mtx_);
    while (!completion_queue_.empty() && !completion_queue_.front().empty()) {
      bool response_complete = false;
      for (auto& response_pair : completion_queue_.front()) {
        // Assuming FINAL flag is set only in the last response of the request
        response_complete =
            ((response_pair.second & TRITONSERVER_RESPONSE_COMPLETE_FINAL) !=
             0);
        responses.emplace_back(std::move(response_pair));
      }
      if (response_complete) {
        completion_queue_.pop_front();
      } else {
        completion_queue_.front().clear();
      }
    }
  }

  for (auto& response : responses) {
    InferenceResponse::Send(std::move(response.first), response.second);
  }
}

}}  // namespace triton::core
3rdparty/core-r22.12/src/dynamic_batch_scheduler.h
0 → 100644
View file @
b30f3cdb
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <future>
#include <map>
#include <mutex>
#include <queue>
#include <set>
#include <thread>
#include "backend_model.h"
#include "backend_model_instance.h"
#include "model_config.pb.h"
#include "rate_limiter.h"
#include "scheduler.h"
#include "scheduler_utils.h"
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {

// Scheduler that implements dynamic batching.
class DynamicBatchScheduler : public Scheduler {
 public:
  // Create a scheduler to support a given number of runners and a run
  // function to call when a request is scheduled.
  static Status Create(
      TritonModel* model, TritonModelInstance* model_instance, const int nice,
      const bool dynamic_batching_enabled, const int32_t max_batch_size,
      const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
      const bool preserve_ordering, const bool response_cache_enable,
      const std::set<int32_t>& preferred_batch_sizes,
      const uint64_t max_queue_delay_microseconds,
      std::unique_ptr<Scheduler>* scheduler);

  // Create a scheduler to support a given number of runners and a run
  // function to call when a request is scheduled. This scheduler also
  // supports different queue policies for different priority levels.
  static Status Create(
      TritonModel* model, TritonModelInstance* model_instance, const int nice,
      const bool dynamic_batching_enabled, const int32_t max_batch_size,
      const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
      const inference::ModelDynamicBatching& batcher_config,
      const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler);

  ~DynamicBatchScheduler();

  // \see Scheduler::Enqueue()
  Status Enqueue(std::unique_ptr<InferenceRequest>& request) override;

  // \see Scheduler::InflightInferenceCount()
  size_t InflightInferenceCount() override
  {
    std::unique_lock<std::mutex> lock(mu_);
    if (curr_payload_ != nullptr) {
      return queue_.Size() + curr_payload_->RequestCount();
    }
    return queue_.Size();
  }

  // \see Scheduler::Stop()
  void Stop() override { stop_ = true; }

  MetricModelReporter* MetricReporter() const { return reporter_.get(); }

 private:
  DynamicBatchScheduler(
      TritonModel* model, TritonModelInstance* model_instance,
      const bool dynamic_batching_enabled, const int32_t max_batch_size,
      const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
      const bool preserve_ordering, const bool response_cache_enable,
      const std::set<int32_t>& preferred_batch_sizes,
      const uint64_t max_queue_delay_microseconds,
      const inference::ModelQueuePolicy& default_queue_policy,
      const uint32_t priority_levels,
      const ModelQueuePolicyMap& queue_policy_map);

  void BatcherThread(const int nice);
  void NewPayload();
  uint64_t GetDynamicBatch();
  void DelegateResponse(std::unique_ptr<InferenceRequest>& request);
  void CacheLookUp(
      std::unique_ptr<InferenceRequest>& request,
      std::unique_ptr<InferenceResponse>& cached_response);
  void FinalizeResponses();

  TritonModel* model_;
  TritonModelInstance* model_instance_;

  // Name of the model.
  std::string model_name_;

  // True if dynamic batching is enabled.
  const bool dynamic_batching_enabled_;

  // Map from priority level to queue holding inference requests for the model
  // represented by this scheduler. If priority queues are not supported by the
  // scheduler, then the priority zero entry is used as the single queue.
  PriorityQueue queue_;

  bool stop_;

  std::thread scheduler_thread_;
  std::atomic<bool> scheduler_thread_exit_;

  // Mutex and condvar for signaling the scheduler thread
  std::mutex mu_;
  std::condition_variable cv_;

  std::shared_ptr<RateLimiter> rate_limiter_;

  std::shared_ptr<Payload> curr_payload_;
  bool payload_saturated_;

  size_t max_batch_size_;
  size_t max_preferred_batch_size_;
  std::set<int32_t> preferred_batch_sizes_;
  uint64_t pending_batch_delay_ns_;

  size_t pending_batch_size_;
  size_t queued_batch_size_;
  size_t next_preferred_batch_size_;

  // The input tensors that require shape checking before being
  // allowed in a batch. As a map from the tensor name to a bool. If a
  // tensor is in the map then its shape must match the shape of the same
  // tensor in requests already in the batch. If the value is "true" then
  // additionally the tensor is treated as a shape tensor and the values
  // contained in the shape tensor must match the same tensor already in
  // the batch.
  const std::unordered_map<std::string, bool> enforce_equal_shape_tensors_;

  // Store information on whether the model contains optional inputs.
  bool has_optional_input_;

  // If true the ordering of responses matches the order of requests
  // even when there are multiple scheduler threads.
  const bool preserve_ordering_;

  // If true, the scheduler will try to retrieve responses from the cache.
  bool response_cache_enabled_;

  // Per completion-id queues to store the ready responses
  std::deque<
      std::vector<std::pair<std::unique_ptr<InferenceResponse>, uint32_t>>>
      completion_queue_;

  // Lock to protect completion_queue_
  std::mutex completion_queue_mtx_;

  // Preserves the order in which responses are finalized
  std::mutex finalize_mtx_;

  // Reporter for metrics, or nullptr if no metrics should be reported
  std::shared_ptr<MetricModelReporter> reporter_;
};

}}  // namespace triton::core
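
As a rough illustration of how this scheduler is wired up, the sketch below (not part of the repository; the argument values are made up and error handling is elided) creates a scheduler through the first Create() overload declared above and pushes a request into it:

    std::unique_ptr<triton::core::Scheduler> scheduler;
    // 'model' is an existing TritonModel*; a null TritonModelInstance* and
    // nice = 0 mirror the values used elsewhere in this commit.
    auto status = triton::core::DynamicBatchScheduler::Create(
        model, nullptr /* model_instance */, 0 /* nice */,
        true /* dynamic_batching_enabled */, 16 /* max_batch_size */,
        {} /* enforce_equal_shape_tensors */, false /* preserve_ordering */,
        false /* response_cache_enable */, {4, 8} /* preferred_batch_sizes */,
        100 /* max_queue_delay_microseconds */, &scheduler);
    if (status.IsOk()) {
      std::unique_ptr<triton::core::InferenceRequest> request;
      // ... populate 'request' (inputs, requested outputs, callbacks) ...
      status = scheduler->Enqueue(request);  // takes ownership on success
    }

In a deployed model these values typically come from the dynamic_batching block of the model's config.pbtxt (parsed into inference::ModelDynamicBatching for the second Create() overload), for example: dynamic_batching { preferred_batch_size: [ 4, 8 ] max_queue_delay_microseconds: 100 }.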