"src/diffusers/pipelines/pipeline_glide.py" did not exist on "dc6324d44bc189a0bf63018145617a736e7a38ff"
Commit b30f3cdb authored by xiabo

Add the downloaded code

parent e38ee081
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONSERVER API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that the Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
TRITONSERVER_TYPE_INVALID,
TRITONSERVER_TYPE_BOOL,
TRITONSERVER_TYPE_UINT8,
TRITONSERVER_TYPE_UINT16,
TRITONSERVER_TYPE_UINT32,
TRITONSERVER_TYPE_UINT64,
TRITONSERVER_TYPE_INT8,
TRITONSERVER_TYPE_INT16,
TRITONSERVER_TYPE_INT32,
TRITONSERVER_TYPE_INT64,
TRITONSERVER_TYPE_FP16,
TRITONSERVER_TYPE_FP32,
TRITONSERVER_TYPE_FP64,
TRITONSERVER_TYPE_BYTES,
TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType
TRITONSERVER_StringToDataType(const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has a variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param datatype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t
TRITONSERVER_DataTypeByteSize(TRITONSERVER_DataType datatype);
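/// For example, a minimal sketch of converting between the string and enum
/// forms of a datatype and querying the element size ("FP32" is just one of
/// the valid datatype names):
///
///   TRITONSERVER_DataType dt = TRITONSERVER_StringToDataType("FP32");
///   if (dt != TRITONSERVER_TYPE_INVALID) {
///     const char* name = TRITONSERVER_DataTypeString(dt);     // "FP32"
///     uint32_t byte_size = TRITONSERVER_DataTypeByteSize(dt); // 4
///   }
///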
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
TRITONSERVER_MEMORY_CPU,
TRITONSERVER_MEMORY_CPU_PINNED,
TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
TRITONSERVER_PARAMETER_STRING,
TRITONSERVER_PARAMETER_INT,
TRITONSERVER_PARAMETER_BOOL,
TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'.
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
const char* name, const TRITONSERVER_ParameterType type, const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
TRITONSERVER_Parameter* parameter);
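/// For example, a hedged sketch that creates an INT parameter and releases
/// it (the parameter name "custom_key" is purely illustrative). The object
/// keeps its own copy of the pointed-to value, so the local variable does
/// not need to outlive the parameter:
///
///   int64_t value = 42;
///   TRITONSERVER_Parameter* param = TRITONSERVER_ParameterNew(
///       "custom_key", TRITONSERVER_PARAMETER_INT, &value);
///   if (param != nullptr) {
///     // ... pass 'param' to an API that accepts parameters ...
///     TRITONSERVER_ParameterDelete(param);
///   }
///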
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
TRITONSERVER_INSTANCEGROUPKIND_AUTO,
TRITONSERVER_INSTANCEGROUPKIND_CPU,
TRITONSERVER_INSTANCEGROUPKIND_GPU,
TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
TRITONSERVER_LOG_INFO,
TRITONSERVER_LOG_WARN,
TRITONSERVER_LOG_ERROR,
TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
TRITONSERVER_LOG_DEFAULT,
TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
TRITONSERVER_LogLevel level, const char* filename, const int line,
const char* msg);
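/// For example, a minimal sketch that logs a message only when INFO-level
/// logging is enabled (the message text is illustrative):
///
///   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_INFO)) {
///     TRITONSERVER_Error* err = TRITONSERVER_LogMessage(
///         TRITONSERVER_LOG_INFO, __FILE__, __LINE__, "model repository ready");
///     if (err != nullptr) {
///       TRITONSERVER_ErrorDelete(err);  // declared below
///     }
///   }
///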
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates error and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
TRITONSERVER_ERROR_UNKNOWN,
TRITONSERVER_ERROR_INTERNAL,
TRITONSERVER_ERROR_NOT_FOUND,
TRITONSERVER_ERROR_INVALID_ARG,
TRITONSERVER_ERROR_UNAVAILABLE,
TRITONSERVER_ERROR_UNSUPPORTED,
TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code
TRITONSERVER_ErrorCode(TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
TRITONSERVER_Error* error);
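/// For example, a typical error-handling sketch (SomeTritonCall stands in
/// for any API in this header that returns a TRITONSERVER_Error*):
///
///   TRITONSERVER_Error* err = SomeTritonCall(/* ... */);
///   if (err != nullptr) {
///     fprintf(
///         stderr, "%s: %s\n", TRITONSERVER_ErrorCodeString(err),
///         TRITONSERVER_ErrorMessage(err));
///     TRITONSERVER_ErrorDelete(err);
///   }
///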
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after the
/// TRITONSERVER_ResponseAllocatorAllocFn_t function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
*TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by the allocator, taking into account the caller's preferred
/// type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type ID
/// preferred by the allocator, taking into account the caller's preferred
/// type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp,
const char* tensor_name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
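/// For example, a minimal CPU-only sketch of the allocator callbacks and
/// allocator creation (the names CpuAlloc and CpuRelease are illustrative
/// and error handling is omitted):
///
///   TRITONSERVER_Error* CpuAlloc(
///       TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
///       size_t byte_size, TRITONSERVER_MemoryType memory_type,
///       int64_t memory_type_id, void* userp, void** buffer,
///       void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
///       int64_t* actual_memory_type_id)
///   {
///     *buffer = (byte_size == 0) ? nullptr : malloc(byte_size);
///     *buffer_userp = nullptr;
///     *actual_memory_type = TRITONSERVER_MEMORY_CPU;
///     *actual_memory_type_id = 0;
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_Error* CpuRelease(
///       TRITONSERVER_ResponseAllocator* allocator, void* buffer,
///       void* buffer_userp, size_t byte_size,
///       TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
///   {
///     free(buffer);
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_ResponseAllocator* allocator = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_ResponseAllocatorNew(
///       &allocator, CpuAlloc, CpuRelease, nullptr /* start_fn */);
///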
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as
/// for the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. The function
/// will usually be called before alloc_fn to determine the allocator's
/// preferred memory type and memory type ID in the current situation so
/// that different execution decisions can be made.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from serialized JSON string.
///
/// \param message The message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
TRITONSERVER_Message* message, const char** base, size_t* byte_size);
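/// For example, a minimal sketch that round-trips a serialized JSON string
/// through a message object (the JSON content is illustrative):
///
///   const char* json = "{\"key\":\"value\"}";
///   TRITONSERVER_Message* message = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_MessageNewFromSerializedJson(
///       &message, json, strlen(json));
///   if (err == nullptr) {
///     const char* base;
///     size_t byte_size;
///     TRITONSERVER_MessageSerializeToJson(message, &base, &byte_size);
///     // 'base' is owned by 'message'; copy it before deleting the message.
///     TRITONSERVER_MessageDelete(message);
///   }
///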
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
const char** base, size_t* byte_size);
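/// For example, a hedged sketch that assumes a TRITONSERVER_Metrics object
/// ('metrics') has already been obtained from a running server:
///
///   const char* base;
///   size_t byte_size;
///   TRITONSERVER_Error* err = TRITONSERVER_MetricsFormatted(
///       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
///   if (err == nullptr) {
///     // Use the Prometheus text in 'base' before deleting 'metrics'.
///   }
///   TRITONSERVER_MetricsDelete(metrics);
///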
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
/// Tracing disabled. No trace activities are reported.
TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MIN = 1,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MAX = 2,
/// Record timestamps for the inference request.
TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
/// Record input and output tensor values for the inference request.
TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
TRITONSERVER_TRACE_REQUEST_START = 0,
TRITONSERVER_TRACE_QUEUE_START = 1,
TRITONSERVER_TRACE_COMPUTE_START = 2,
TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
TRITONSERVER_TRACE_COMPUTE_END = 5,
TRITONSERVER_TRACE_REQUEST_END = 6,
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, const char* name,
TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
const int64_t* shape, uint64_t dim_count,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
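/// For example, a minimal sketch of the trace callbacks and trace creation
/// (the names TraceActivity and TraceRelease are illustrative):
///
///   void TraceActivity(
///       TRITONSERVER_InferenceTrace* trace,
///       TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
///       void* userp)
///   {
///     // Record (trace id, activity, timestamp) in user-owned storage.
///   }
///
///   void TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp)
///   {
///     TRITONSERVER_InferenceTraceDelete(trace);  // callback owns 'trace'
///   }
///
///   TRITONSERVER_InferenceTrace* trace = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceTraceNew(
///       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
///       TraceActivity, TraceRelease, nullptr /* trace_userp */);
///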
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback function will be called to report
/// activity for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
/// function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
TRITONSERVER_InferenceResponse* response, const uint32_t flags,
void* userp);
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server the inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
TRITONSERVER_InferenceRequest** inference_request,
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate that two or more inference requests are related to each
/// other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string.
/// Default is empty "", which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is an unsigned
/// integer, then this function will return a failure. The correlation ID
/// is used to indicate that two or more inference requests are related to
/// each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is handled
/// by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestAddInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const TRITONSERVER_DataType datatype, const int64_t* shape,
uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, data type
/// and shape of the input will be deduced from model configuration.
/// This function must be called at most once on a request with no other
/// inputs to ensure the deduction is accurate.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used to reference
/// the raw input in other Tritonserver APIs. It is not associated with the
/// name used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
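/// For example, a hedged sketch that creates a request and attaches a CPU
/// input buffer (the model name "my_model" and input name "INPUT0" are
/// illustrative; 'server' is an existing TRITONSERVER_Server*):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceRequestNew(
///       &request, server, "my_model", -1 /* latest version */);
///
///   const int64_t shape[2] = {1, 16};
///   static float input_data[16];  // must remain valid and unmodified until
///                                 // released by the request
///   err = TRITONSERVER_InferenceRequestAddInput(
///       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
///   err = TRITONSERVER_InferenceRequestAppendInputData(
///       request, "INPUT0", input_data, sizeof(input_data),
///       TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);
///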
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will
/// be used, so a non-host-policy-specific version of the data must be added
/// using that API.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_ResponseAllocator* response_allocator,
void* response_allocator_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp);
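/// For example, a hedged sketch of the release and response-complete
/// callbacks and how they are attached to a request (the names
/// RequestRelease and ResponseComplete are illustrative; 'request' and
/// 'allocator' are objects created as shown in the earlier sketches):
///
///   void RequestRelease(
///       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
///       void* userp)
///   {
///     if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
///       TRITONSERVER_InferenceRequestDelete(request);
///     }
///   }
///
///   void ResponseComplete(
///       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
///       void* userp)
///   {
///     if (response != nullptr) {
///       // Read outputs here, then release the response.
///       TRITONSERVER_InferenceResponseDelete(response);
///     }
///   }
///
///   err = TRITONSERVER_InferenceRequestSetReleaseCallback(
///       request, RequestRelease, nullptr /* request_release_userp */);
///   err = TRITONSERVER_InferenceRequestSetResponseCallback(
///       request, allocator, nullptr /* response_allocator_userp */,
///       ResponseComplete, nullptr /* response_userp */);
///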
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseDelete(
TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. Return a
/// TRITONSERVER_Error object on failure, return nullptr on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseError(
TRITONSERVER_InferenceResponse* inference_response);
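/// As a usage sketch, a typical check inside a response-complete callback:
///
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceResponseError(response);
///   if (err != nullptr) {
///     // The inference failed. 'err' is owned by 'response' and must not be
///     // deleted here; TRITONSERVER_ErrorMessage (declared earlier in this
///     // header) can be used to obtain a printable message.
///   }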
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseModel(
TRITONSERVER_InferenceResponse* inference_response, const char** model_name,
int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseId(
TRITONSERVER_InferenceResponse* inference_response,
const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a pointer that must be cast
/// appropriately based on 'type'. For example:
///
///   const void* vvalue;
///   TRITONSERVER_ParameterType type;
///   TRITONSERVER_InferenceResponseParameter(
///       response, index, &name, &type, &vvalue);
///   switch (type) {
///     case TRITONSERVER_PARAMETER_BOOL:
///       bool value = *(reinterpret_cast<const bool*>(vvalue));
///       ...
///     case TRITONSERVER_PARAMETER_INT:
///       int64_t value = *(reinterpret_cast<const int64_t*>(vvalue));
///       ...
///     case TRITONSERVER_PARAMETER_STRING:
///       const char* value = reinterpret_cast<const char*>(vvalue);
///       ...
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseOutput(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
uint64_t* dim_count, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
void** userp);
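/// As a usage sketch, iterating the outputs of a response with the two
/// functions above ('response' is a placeholder; error handling omitted):
///
///   uint32_t output_count = 0;
///   TRITONSERVER_InferenceResponseOutputCount(response, &output_count);
///   for (uint32_t idx = 0; idx < output_count; ++idx) {
///     const char* name = nullptr;
///     TRITONSERVER_DataType datatype;
///     const int64_t* shape = nullptr;
///     uint64_t dim_count = 0;
///     const void* base = nullptr;
///     size_t byte_size = 0;
///     TRITONSERVER_MemoryType memory_type;
///     int64_t memory_type_id = 0;
///     void* userp = nullptr;
///     TRITONSERVER_InferenceResponseOutput(
///         response, idx, &name, &datatype, &shape, &dim_count, &base,
///         &byte_size, &memory_type, &memory_type_id, &userp);
///     // 'base' and 'byte_size' stay valid until the response is deleted.
///   }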
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if there is no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesNew(
TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesDelete(
TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If no cudaIpcHandle has been set for the buffer,
/// nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
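/// As a usage sketch, creating, populating, and querying a buffer attributes
/// object with the functions above, assuming the TRITONSERVER_MEMORY_CPU
/// enumerator declared earlier in this header (values are illustrative;
/// error handling omitted):
///
///   TRITONSERVER_BufferAttributes* attrs = nullptr;
///   TRITONSERVER_BufferAttributesNew(&attrs);
///   TRITONSERVER_BufferAttributesSetMemoryType(attrs, TRITONSERVER_MEMORY_CPU);
///   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
///   TRITONSERVER_BufferAttributesSetByteSize(attrs, 4096);
///
///   size_t byte_size = 0;
///   TRITONSERVER_BufferAttributesByteSize(attrs, &byte_size);
///   TRITONSERVER_BufferAttributesDelete(attrs);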
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
TRITONSERVER_MODEL_CONTROL_NONE,
TRITONSERVER_MODEL_CONTROL_POLL,
TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
TRITONSERVER_RATE_LIMIT_OFF,
TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsNew(
TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsDelete(
TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetServerId(
TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in the model repository will
/// be loaded on startup. After startup any changes to the model repository
/// will be ignored. Calling TRITONSERVER_ServerPollModelRepository will
/// result in an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in the model repository will
/// be loaded on startup. The model repository can be polled periodically
/// using TRITONSERVER_ServerPollModelRepository and the server will load,
/// unload, and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in the model repository
/// will not be loaded on startup. The corresponding model control APIs must
/// be called to load/unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect in TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
TRITONSERVER_ServerOptions* options, const char* model_name);
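/// As a usage sketch, configuring explicit model control with a single
/// startup model (the path and model name are placeholders):
///
///   TRITONSERVER_ServerOptions* options = nullptr;
///   TRITONSERVER_ServerOptionsNew(&options);
///   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
///   TRITONSERVER_ServerOptionsSetModelControlMode(
///       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
///   TRITONSERVER_ServerOptionsSetStartupModel(options, "my_model");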
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiter prioritizes
/// inference execution based on the number of times each instance has been
/// given the chance to run. An execution runs only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: Rate limiting is turned off and an
/// inference is executed whenever a model instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for instance groups
/// in the model config to determine whether a resource is global. In case of
/// conflicting resource types in different model configurations, the server
/// will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
TRITONSERVER_ServerOptions* options, const char* resource_name,
const size_t resource_count, const int device);
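/// As a usage sketch, enabling execution-count rate limiting and registering
/// a resource that is available on every device (the resource name and count
/// are illustrative):
///
///   TRITONSERVER_ServerOptionsSetRateLimiterMode(
///       options, TRITONSERVER_RATE_LIMIT_EXEC_COUNT);
///   TRITONSERVER_ServerOptionsAddRateLimiterResource(
///       options, "R1", 4 /* count */, -1 /* available on every device */);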
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
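/// As a usage sketch, sizing the memory pools and response cache (the sizes
/// are illustrative, not recommendations):
///
///   TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
///       options, 256 * 1024 * 1024);                     // 256 MiB pinned
///   TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
///       options, 0 /* gpu_device */, 64 * 1024 * 1024);  // 64 MiB on GPU 0
///   TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
///       options, 64 * 1024 * 1024);                      // 64 MiB cache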
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used in buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file A string naming the file where the log output will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// output to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogFile(
TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogInfo(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogWarn(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogError(
TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
TRITONSERVER_ServerOptions* options, int level);
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetMetrics(
TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
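/// As a usage sketch, directing logs to a file and enabling metrics with a
/// one second collection interval (the file path is a placeholder):
///
///   TRITONSERVER_ServerOptionsSetLogFile(options, "/tmp/triton.log");
///   TRITONSERVER_ServerOptionsSetLogInfo(options, true);
///   TRITONSERVER_ServerOptionsSetLogVerbose(options, 1);
///   TRITONSERVER_ServerOptionsSetMetrics(options, true);
///   TRITONSERVER_ServerOptionsSetGpuMetrics(options, true);
///   TRITONSERVER_ServerOptionsSetMetricsInterval(options, 1000 /* ms */);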
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently only TRITONSERVER_INSTANCEGROUPKIND_GPU is supported.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
TRITONSERVER_ServerOptions* options,
const TRITONSERVER_InstanceGroupKind kind, const int device_id,
const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
TRITONSERVER_ServerOptions* options, const char* backend_name,
const char* setting, const char* value);
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
TRITONSERVER_ServerOptions* options, const char* policy_name,
const char* setting, const char* value);
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
TRITONSERVER_BATCH_UNKNOWN = 1,
TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
TRITONSERVER_TXN_ONE_TO_ONE = 1,
TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerNew(
TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerDelete(
TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerStop(
TRITONSERVER_Server* server);
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as its
/// value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path,
const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsLive(
TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsReady(
TRITONSERVER_Server* server, bool* ready);
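/// Putting the options and server functions together, a minimal lifecycle
/// sketch (error handling omitted; 'options' is a previously configured
/// server options object):
///
///   TRITONSERVER_Server* server = nullptr;
///   TRITONSERVER_ServerNew(&server, options);
///   // The options object is typically no longer needed once the server
///   // has been created.
///   TRITONSERVER_ServerOptionsDelete(options);
///
///   bool live = false, ready = false;
///   TRITONSERVER_ServerIsLive(server, &live);
///   TRITONSERVER_ServerIsReady(server, &ready);
///
///   // ... submit inference requests ...
///
///   TRITONSERVER_ServerStop(server);
///   TRITONSERVER_ServerDelete(server);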
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIsReady(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, bool* ready);
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* flags, void** voidp);
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* txn_flags, void** voidp);
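/// As a usage sketch, querying both property sets and testing the returned
/// flag bits ('server' and the model name are placeholders):
///
///   uint32_t batch_flags = 0, txn_flags = 0;
///   void* voidp = nullptr;
///   TRITONSERVER_ServerModelBatchProperties(
///       server, "my_model", -1 /* version per policy */, &batch_flags,
///       &voidp);
///   TRITONSERVER_ServerModelTransactionProperties(
///       server, "my_model", -1, &txn_flags, &voidp);
///   const bool batches_first_dim =
///       (batch_flags & TRITONSERVER_BATCH_FIRST_DIM) != 0;
///   const bool decoupled = (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0;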
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetadata(
TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelMetadata(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelStatistics(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelConfig(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, const uint32_t config_version,
TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Return the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIndex(
TRITONSERVER_Server* server, uint32_t flags,
TRITONSERVER_Message** model_index);
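/// As a usage sketch, listing only the models that are ready, assuming the
/// TRITONSERVER_MessageSerializeToJson and TRITONSERVER_MessageDelete
/// functions declared earlier in this header:
///
///   TRITONSERVER_Message* model_index = nullptr;
///   TRITONSERVER_ServerModelIndex(
///       server, TRITONSERVER_INDEX_FLAG_READY, &model_index);
///   const char* index_json = nullptr;
///   size_t index_json_size = 0;
///   TRITONSERVER_MessageSerializeToJson(
///       model_index, &index_json, &index_json_size);
///   // ... inspect 'index_json' (valid until the message is deleted) ...
///   TRITONSERVER_MessageDelete(model_index);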
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. The returned error indicates whether the model loaded
/// successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerLoadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. The returned error indicates
/// whether the model loaded successfully.
/// Currently the below parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
TRITONSERVER_Server* server, const char* model_name,
const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
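/// As a usage sketch, loading a model with an override configuration,
/// assuming the TRITONSERVER_ParameterNew and TRITONSERVER_ParameterDelete
/// helpers declared earlier in this header ('config_json' is a placeholder
/// for a JSON model configuration string):
///
///   TRITONSERVER_Parameter* config_param = TRITONSERVER_ParameterNew(
///       "config", TRITONSERVER_PARAMETER_STRING, config_json);
///   const TRITONSERVER_Parameter* params[] = {config_param};
///   TRITONSERVER_ServerLoadModelWithParameters(
///       server, "my_model", params, 1 /* parameter_count */);
///   TRITONSERVER_ParameterDelete(config_param);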
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully unloaded;
/// a success code is returned once unloading has been initiated.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerUnloadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent models that
/// were loaded along with the requested model (for example, the models
/// composing an ensemble). Unloading a model that is not loaded on the
/// server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded; a success code is returned once unloading
/// has been initiated.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
TRITONSERVER_Server* server, const char* model_name);
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetrics(
TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerInferAsync(
TRITONSERVER_Server* server,
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceTrace* trace);
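/// End to end, a sketch of submitting a request, assuming the
/// TRITONSERVER_InferenceRequestNew constructor declared earlier in this
/// header (inputs, requested outputs, and callbacks are set up as shown
/// earlier):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_InferenceRequestNew(
///       &request, server, "my_model", -1 /* version per policy */);
///   // ... add inputs, requested outputs, and the release/response
///   // callbacks ...
///   TRITONSERVER_ServerInferAsync(server, request, nullptr /* no trace */);
///   // On success, 'request' must not be accessed again until the release
///   // callback returns ownership.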
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
TRITONSERVER_METRIC_KIND_COUNTER,
TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyDelete(
TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricNew(
TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricDelete(
TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricValue(
TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricIncrement(
TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The value to set the metric to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricSet(
TRITONSERVER_Metric* metric, double value);
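/// Taken together, a sketch of a custom counter metric, assuming the
/// TRITONSERVER_ParameterNew helper declared earlier in this header for the
/// label (the family and label names are illustrative):
///
///   TRITONSERVER_MetricFamily* family = nullptr;
///   TRITONSERVER_MetricFamilyNew(
///       &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
///       "Example counter of handled requests");
///
///   TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
///       "endpoint", TRITONSERVER_PARAMETER_STRING, "infer");
///   const TRITONSERVER_Parameter* labels[] = {label};
///   TRITONSERVER_Metric* metric = nullptr;
///   TRITONSERVER_MetricNew(&metric, family, labels, 1 /* label_count */);
///   TRITONSERVER_ParameterDelete(label);
///
///   TRITONSERVER_MetricIncrement(metric, 1.0);
///
///   // Metrics must be deleted before their family.
///   TRITONSERVER_MetricDelete(metric);
///   TRITONSERVER_MetricFamilyDelete(family);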
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_GetMetricKind(
TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
#ifdef __cplusplus
}
#endif
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
namespace {
Status
GetTFSpecializedBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* specialized_name)
{
std::string tf_version_str = "2";
const auto& itr = config_map.find("tensorflow");
if (itr != config_map.end()) {
if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
if ((tf_version_str != "1") && (tf_version_str != "2")) {
return Status(
Status::Code::INVALID_ARG,
"unexpected TensorFlow library version '" + tf_version_str +
"', expects 1 or 2.");
}
}
}
*specialized_name += tf_version_str;
return Status::Success;
}
} // namespace
Status
BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val)
{
for (const auto& pr : config) {
if (pr.first == key) {
*val = pr.second;
return Status::Success;
}
}
return Status(
Status::Code::INTERNAL,
std::string("unable to find common backend configuration for '") + key +
"'");
}
Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
try {
*val = std::stod(str);
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as double");
}
return Status::Success;
}
Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
try {
std::string lowercase_str{str};
std::transform(
lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
[](unsigned char c) { return std::tolower(c); });
*val = (lowercase_str == "true");
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as bool");
}
return Status::Success;
}
Status
BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map, std::string* dir)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL,
"unable to find global backends directory configuration");
}
RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
return Status::Success;
}
Status
BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
*mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
*mcc = 0;
#endif // TRITON_ENABLE_GPU
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find common backend configuration");
}
std::string min_compute_capability_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "min-compute-capability", &min_compute_capability_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToDouble(min_compute_capability_str, mcc));
return Status::Success;
}
Status
BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find auto-complete configuration");
}
std::string auto_complete_config_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "auto-complete-config", &auto_complete_config_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToBool(auto_complete_config_str, acc));
return Status::Success;
}
Status
BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name)
{
*specialized_name = backend_name;
if (backend_name == "tensorflow") {
RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
}
return Status::Success;
}
Status
BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
*libname = "triton_" + backend_name + ".dll";
#else
*libname = "libtriton_" + backend_name + ".so";
#endif
return Status::Success;
}
Status
BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit)
{
*memory_limit = 1.0;
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find common backend configuration");
}
static std::string key_prefix = "model-load-gpu-limit-device-";
std::string memory_limit_str;
auto status = BackendConfiguration(
itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
// Allow missing key, default to 1.0 (no limit) if the limit is not specified
if (status.IsOk()) {
RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
memory_limit_str, memory_limit));
}
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val);
/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
const std::string& str, double* val);
/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(const std::string& str, bool* val);
/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* dir);
/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);
/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);
/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name);
/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname);
/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit);
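// Illustrative sketch (not part of the API) of how these helpers are chained
// when a backend is loaded; error handling via RETURN_IF_ERROR is omitted and
// 'config_map' is assumed to be a populated BackendCmdlineConfigMap:
//
//   std::string backend_dir, specialized_name, libname;
//   BackendConfigurationGlobalBackendsDirectory(config_map, &backend_dir);
//   BackendConfigurationSpecializeBackendName(
//       config_map, "tensorflow", &specialized_name);
//   BackendConfigurationBackendLibraryName(specialized_name, &libname);
//   // The backend shared library is then searched for at
//   // <backend_dir>/<specialized_name>/<libname>, among other locations.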
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
//
// TritonBackend
//
Status
TritonBackend::Create(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value backend_config_json(
triton::common::TritonJson::ValueType::OBJECT);
if (!backend_cmdline_config.empty()) {
triton::common::TritonJson::Value cmdline_json(
backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : backend_cmdline_config) {
RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(
backend_config_json.Add("cmdline", std::move(cmdline_json)));
}
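// As an example, a backend command line configuration of
// {{"default-max-batch-size", "4"}} results in a backend config message of
// the form {"cmdline":{"default-max-batch-size":"4"}}.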
TritonServerMessage backend_config(backend_config_json);
auto local_backend = std::shared_ptr<TritonBackend>(
new TritonBackend(name, dir, libpath, backend_config));
// Load the library and initialize all the entrypoints
RETURN_IF_ERROR(local_backend->LoadBackendLibrary());
// Backend initialization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object. We must set the shared
// library path to point to the backend directory in case the
// backend library attempts to load additional shared libraries.
if (local_backend->backend_init_fn_ != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));
TRITONSERVER_Error* err = local_backend->backend_init_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
local_backend->UpdateAttributes();
*backend = std::move(local_backend);
return Status::Success;
}
Status
TritonBackend::UpdateAttributes()
{
if (backend_attri_fn_ == nullptr) {
return Status::Success;
}
// Create an Attribute object for the backend to fill. Note that it copies
// some fields from 'attributes_' while the others use default values. This
// is an ad hoc way to determine whether an attribute was set by the backend,
// so we know whether to keep or update the current value.
Attribute latest;
latest.exec_policy_ = attributes_.exec_policy_;
RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(this),
reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));
// Update attributes that were set
attributes_.exec_policy_ = latest.exec_policy_;
if (!latest.preferred_groups_.empty()) {
attributes_.preferred_groups_ = latest.preferred_groups_;
}
return Status::Success;
}
TritonBackend::TritonBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const TritonServerMessage& backend_config)
: name_(name), dir_(dir), libpath_(libpath),
backend_config_(backend_config), state_(nullptr)
{
ClearHandles();
}
TritonBackend::~TritonBackend()
{
LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";
// Backend finalization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object.
if (backend_fini_fn_ != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
"failed finalizing backend");
}
ClearHandles();
}
void
TritonBackend::ClearHandles()
{
dlhandle_ = nullptr;
backend_init_fn_ = nullptr;
backend_fini_fn_ = nullptr;
backend_attri_fn_ = nullptr;
model_init_fn_ = nullptr;
model_fini_fn_ = nullptr;
inst_init_fn_ = nullptr;
inst_fini_fn_ = nullptr;
inst_exec_fn_ = nullptr;
}
Status
TritonBackend::LoadBackendLibrary()
{
TritonBackendInitFn_t bifn;
TritonBackendFiniFn_t bffn;
TritonBackendAttriFn_t bafn;
TritonModelInitFn_t mifn;
TritonModelFiniFn_t mffn;
TritonModelInstanceInitFn_t iifn;
TritonModelInstanceFiniFn_t iffn;
TritonModelInstanceExecFn_t iefn;
{
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));
// Backend initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
reinterpret_cast<void**>(&bifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
reinterpret_cast<void**>(&bffn)));
// Backend attribute function, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
reinterpret_cast<void**>(&bafn)));
// Model initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
reinterpret_cast<void**>(&mifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
reinterpret_cast<void**>(&mffn)));
// Model instance initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
reinterpret_cast<void**>(&iifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
reinterpret_cast<void**>(&iffn)));
// Model instance execute function, required
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
reinterpret_cast<void**>(&iefn)));
}
backend_init_fn_ = bifn;
backend_fini_fn_ = bffn;
backend_attri_fn_ = bafn;
model_init_fn_ = mifn;
model_fini_fn_ = mffn;
inst_init_fn_ = iifn;
inst_fini_fn_ = iffn;
inst_exec_fn_ = iefn;
return Status::Success;
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
*major = TRITONBACKEND_API_VERSION_MAJOR;
*minor = TRITONBACKEND_API_VERSION_MINOR;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*name = tb->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*backend_config = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*policy = tb->ExecutionPolicy();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetExecutionPolicy(policy);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tb->Directory().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
static TritonMemoryManager gMemoryManager;
*manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*state = tb->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetState(state);
return nullptr; // success
}
} // extern C
//
// TritonBackendManager
//
static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;
Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
std::lock_guard<std::mutex> lock(mu_);
// If there is already a manager then we just use it...
*manager = backend_manager_.lock();
if (*manager != nullptr) {
return Status::Success;
}
manager->reset(new TritonBackendManager());
backend_manager_ = *manager;
return Status::Success;
}
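// Illustrative sketch (not part of the implementation) of the sharing
// behavior above:
//
//   std::shared_ptr<TritonBackendManager> m1, m2;
//   TritonBackendManager::Create(&m1);
//   TritonBackendManager::Create(&m2);  // m2 aliases m1 while m1 is alive
//
// Once every shared_ptr is released the weak_ptr expires, and a subsequent
// Create() constructs a fresh manager.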
Status
TritonBackendManager::CreateBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
std::lock_guard<std::mutex> lock(mu_);
const auto& itr = backend_map_.find(libpath);
if (itr != backend_map_.end()) {
*backend = itr->second;
return Status::Success;
}
RETURN_IF_ERROR(TritonBackend::Create(
name, dir, libpath, backend_cmdline_config, backend));
backend_map_.insert({libpath, *backend});
return Status::Success;
}
Status
TritonBackendManager::BackendState(
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>*
backend_state)
{
std::lock_guard<std::mutex> lock(mu_);
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
backend_state_map(
new std::unordered_map<std::string, std::vector<std::string>>);
for (const auto& backend_pair : backend_map_) {
auto& libpath = backend_pair.first;
auto backend = backend_pair.second;
const char* backend_config;
size_t backend_config_size;
backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
backend_state_map->insert(
{backend->Name(), std::vector<std::string>{libpath, backend_config}});
}
*backend_state = std::move(backend_state_map);
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Proxy to a backend shared library.
//
class TritonBackend {
public:
struct Attribute {
Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
TRITONBACKEND_ExecutionPolicy exec_policy_;
std::vector<inference::ModelInstanceGroup> preferred_groups_;
};
typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_cnt);
static Status Create(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
~TritonBackend();
const std::string& Name() const { return name_; }
const std::string& Directory() const { return dir_; }
const TritonServerMessage& BackendConfig() const { return backend_config_; }
const Attribute& BackendAttributes() const { return attributes_; }
TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
{
return attributes_.exec_policy_;
}
void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
{
attributes_.exec_policy_ = policy;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
TritonModelInstanceInitFn_t ModelInstanceInitFn() const
{
return inst_init_fn_;
}
TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
{
return inst_fini_fn_;
}
TritonModelInstanceExecFn_t ModelInstanceExecFn() const
{
return inst_exec_fn_;
}
private:
typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
TRITONBACKEND_Backend* backend,
TRITONBACKEND_BackendAttribute* backend_attributes);
TritonBackend(
const std::string& name, const std::string& dir,
const std::string& libpath, const TritonServerMessage& backend_config);
void ClearHandles();
Status LoadBackendLibrary();
Status UpdateAttributes();
// The name of the backend.
const std::string name_;
// Full path to the directory holding the backend shared library and
// other artifacts.
const std::string dir_;
// Full path to the backend shared library.
const std::string libpath_;
// Backend configuration as JSON
TritonServerMessage backend_config_;
// backend attributes
Attribute attributes_;
// dlopen / dlsym handles
void* dlhandle_;
TritonBackendInitFn_t backend_init_fn_;
TritonBackendFiniFn_t backend_fini_fn_;
TritonBackendAttriFn_t backend_attri_fn_;
TritonModelInitFn_t model_init_fn_;
TritonModelFiniFn_t model_fini_fn_;
TritonModelInstanceInitFn_t inst_init_fn_;
TritonModelInstanceFiniFn_t inst_fini_fn_;
TritonModelInstanceExecFn_t inst_exec_fn_;
// Opaque state associated with the backend.
void* state_;
};
//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
public:
static Status Create(std::shared_ptr<TritonBackendManager>* manager);
Status CreateBackend(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
Status BackendState(
std::unique_ptr<
std::unordered_map<std::string, std::vector<std::string>>>*
backend_state);
private:
DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
TritonBackendManager() = default;
std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
TRITONBACKEND_MemoryManager* manager, void** buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
const uint64_t byte_size)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
{
auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"GPU memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
{
TRITONSERVER_MemoryType mt = memory_type;
auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Pinned memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU: {
*buffer = malloc(byte_size);
if (*buffer == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
}
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
TRITONBACKEND_MemoryManager* manager, void* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
auto status = CudaMemoryManager::Free(buffer, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
auto status = PinnedMemoryManager::Free(buffer);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU:
free(buffer);
break;
}
return nullptr; // success
}
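// Illustrative backend-side usage of the two entrypoints above (a sketch,
// not part of the implementation; error checking omitted and 'manager'
// assumed to come from TRITONBACKEND_BackendMemoryManager):
//
//   void* buffer = nullptr;
//   TRITONBACKEND_MemoryManagerAllocate(
//       manager, &buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */,
//       1024 /* byte_size */);
//   ...
//   TRITONBACKEND_MemoryManagerFree(
//       manager, buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);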
} // extern C
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {
// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
Status
TritonModel::Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model)
{
model->reset();
// The model configuration must specify a backend. The name of the
// corresponding shared library must be libtriton_<backend>.so
// (triton_<backend>.dll on Windows).
if (model_config.backend().empty()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'backend' for '" + model_config.name() + "'");
}
// Localize the content of the model repository corresponding to
// 'model_name'. This model holds a handle to the localized content
// so that it persists as long as the model is loaded.
std::shared_ptr<LocalizedPath> localized_model_dir;
RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));
// Localize paths in backend model config
// [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
model_path, &model_config, &localized_model_dir));
// Get some internal configuration values needed for initialization.
std::string backend_dir;
RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
backend_cmdline_config_map, &backend_dir));
bool auto_complete_config = false;
RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
backend_cmdline_config_map, &auto_complete_config));
double min_compute_capability = 0;
RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
backend_cmdline_config_map, &min_compute_capability));
std::string specialized_backend_name;
RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
backend_cmdline_config_map, model_config.backend(),
&specialized_backend_name));
std::string backend_libname;
RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
specialized_backend_name, &backend_libname));
// Get the path to the backend shared library. Search path is
// version directory, model directory, global backend directory.
const auto localized_model_path = localized_model_dir->Path();
const auto version_path =
JoinPath({localized_model_path, std::to_string(version)});
const std::string global_path =
JoinPath({backend_dir, specialized_backend_name});
const std::vector<std::string> search_paths = {
version_path, localized_model_path, global_path};
std::string backend_libdir;
std::string backend_libpath;
for (const auto& path : search_paths) {
const auto full_path = JoinPath({path, backend_libname});
bool exists = false;
RETURN_IF_ERROR(FileExists(full_path, &exists));
if (exists) {
backend_libdir = path;
backend_libpath = full_path;
break;
}
}
if (backend_libpath.empty()) {
return Status(
Status::Code::INVALID_ARG, "unable to find '" + backend_libname +
"' for model '" + model_config.name() +
"', searched: " + version_path + ", " +
model_path + ", " + global_path);
}
// Resolve the global backend configuration with the specific backend
// configuration
triton::common::BackendCmdlineConfig config;
RETURN_IF_ERROR(ResolveBackendConfigs(
backend_cmdline_config_map, model_config.backend(), config));
RETURN_IF_ERROR(SetBackendConfigDefaults(config));
std::shared_ptr<TritonBackend> backend;
RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
model_config.backend(), backend_libdir, backend_libpath, config,
&backend));
// Normalize backend-dependent config
{
const auto& attributes = backend->BackendAttributes();
// [WIP] formalize config normalization / validation
RETURN_IF_ERROR(NormalizeInstanceGroup(
min_compute_capability, attributes.preferred_groups_, &model_config));
RETURN_IF_ERROR(
ValidateInstanceGroup(model_config, min_compute_capability));
}
// Create and initialize the model.
std::unique_ptr<TritonModel> local_model(new TritonModel(
server, localized_model_dir, backend, min_compute_capability, version,
model_config, auto_complete_config));
TritonModel* raw_local_model = local_model.get();
// Model initialization is optional... The TRITONBACKEND_Model
// object is this TritonModel object. We must set the shared library
// path to point to the backend directory in case the backend
// library attempts to load additional shared libraries.
if (backend->ModelInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));
TRITONSERVER_Error* err = backend->ModelInitFn()(
reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
// Initialize the model for Triton core usage
RETURN_IF_ERROR(local_model->Init(is_config_provided));
bool device_blocking = false;
if (local_model->backend_->ExecutionPolicy() ==
TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
if (model_config.has_sequence_batching()) {
LOG_INFO << "Overriding execution policy to "
"\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
<< model_config.name() << "\"";
} else {
device_blocking = true;
}
}
// Create and initialize the model instances for this model.
RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
raw_local_model, backend_cmdline_config_map, host_policy_map,
model_config, device_blocking));
RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
*model = std::move(local_model);
return Status::Success;
}
Status
TritonModel::ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config)
{
const auto& global_itr = backend_cmdline_config_map.find(std::string());
const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
if (specific_itr == backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
for (auto setting : global_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr == backend_cmdline_config_map.end()) {
for (auto setting : specific_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
triton::common::BackendCmdlineConfig global_backend_config =
global_itr->second;
triton::common::BackendCmdlineConfig specific_backend_config =
specific_itr->second;
std::sort(global_backend_config.begin(), global_backend_config.end());
std::sort(specific_backend_config.begin(), specific_backend_config.end());
size_t global_index = 0;
size_t specific_index = 0;
while (global_index < global_backend_config.size() &&
specific_index < specific_backend_config.size()) {
auto& current_global_setting = global_backend_config.at(global_index);
auto& current_specific_setting =
specific_backend_config.at(specific_index);
if (current_specific_setting.first.compare(
current_global_setting.first) == 0) {
// specific setting overrides global setting
config.push_back(current_specific_setting);
++global_index;
++specific_index;
} else if (
current_specific_setting.first.compare(current_global_setting.first) <
0) {
config.push_back(current_specific_setting);
++specific_index;
} else {
config.push_back(current_global_setting);
++global_index;
}
}
// Add the rest of the global configs
while (global_index < global_backend_config.size()) {
config.push_back(global_backend_config.at(global_index++));
}
// Add the rest of the specific settings
while (specific_index < specific_backend_config.size()) {
config.push_back(specific_backend_config.at(specific_index++));
}
} // else empty config
return Status::Success;
}
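// Illustrative example (not part of the implementation) of the merge above:
// with a global config of {{"a", "1"}, {"b", "2"}} and a backend-specific
// config of {{"b", "3"}}, the resolved 'config' is {{"a", "1"}, {"b", "3"}};
// the backend-specific value overrides the global one and all other settings
// are kept.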
const std::unordered_map<std::string, std::string> backend_config_defaults(
{{"default-max-batch-size", "4"}});
Status
TritonModel::SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config)
{
auto backend_config_defaults_copy = backend_config_defaults;
for (auto& setting : config) {
if (setting.first.compare("default-max-batch-size") == 0) {
LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
<< "," << setting.second;
backend_config_defaults_copy.erase(setting.first);
}
if (backend_config_defaults_copy.empty()) {
break;
}
}
// Anything left should be added to the config
for (const auto& default_setting : backend_config_defaults_copy) {
LOG_VERBOSE(1) << "Adding default backend config setting: "
<< default_setting.first << "," << default_setting.second;
config.push_back(
std::make_pair(default_setting.first, default_setting.second));
}
return Status::Success;
}
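// Illustrative example (not part of the implementation): a config that does
// not mention "default-max-batch-size" gets {"default-max-batch-size", "4"}
// appended; a config that already sets it is left untouched.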
Status
TritonModel::AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
if (passive) {
passive_instances_.emplace_back(std::move(instance));
} else {
instances_.emplace_back(std::move(instance));
}
return Status::Success;
}
Status
TritonModel::UpdateModelConfig(
const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
{
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
updated_config_message, &buffer, &byte_size));
inference::ModelConfig updated_config;
RETURN_IF_ERROR(
JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
auto config = Config();
config.set_max_batch_size(updated_config.max_batch_size());
auto inputs_config = config.mutable_input();
*inputs_config = updated_config.input();
auto outputs_config = config.mutable_output();
*outputs_config = updated_config.output();
if (!config.scheduling_choice_case()) {
if (updated_config.has_dynamic_batching()) {
auto dynamic_batching_config = config.mutable_dynamic_batching();
*dynamic_batching_config = updated_config.dynamic_batching();
} else if (updated_config.has_sequence_batching()) {
auto sequence_batching_config = config.mutable_sequence_batching();
*sequence_batching_config = updated_config.sequence_batching();
} else if (updated_config.has_ensemble_scheduling()) {
auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
*ensemble_scheduling_config = updated_config.ensemble_scheduling();
} // else do nothing
} else if (
config.scheduling_choice_case() !=
updated_config.scheduling_choice_case()) {
return Status(
triton::common::Error::Code::INTERNAL,
(std::string("Cannot update scheduling choice from ") +
std::to_string(config.scheduling_choice_case()) + std::string(" to ") +
std::to_string(updated_config.scheduling_choice_case()) +
std::string(" when auto-completing."))
.c_str());
} // else do nothing
// Normalize the model configuration to populate missing fields.
RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));
RETURN_IF_ERROR(SetModelConfig(config));
return Status::Success;
}
Status
TritonModel::SetConfiguredScheduler()
{
std::unique_ptr<Scheduler> scheduler;
// Need to enforce equal shape batches (i.e. non-ragged batches) if
// the model 1) allows one or more variable-size input tensors that
// are not marked as 'allow_ragged_batch' or 2) has one or more
// shape-tensor inputs. This is not needed if all input shapes are
// non-variable and if there are no shape tensors... so we don't
// enable it in that case for efficiency reasons.
std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
for (const auto input : config_.input()) {
if (input.is_shape_tensor()) {
enforce_equal_shape_tensors.insert({input.name(), true});
} else if (
!input.allow_ragged_batch() &&
(triton::common::GetElementCount(input) == -1)) {
enforce_equal_shape_tensors.insert({input.name(), false});
}
}
// If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
// otherwise use the default DynamicBatchScheduler.
if (config_.has_sequence_batching()) {
// Sequence batcher
RETURN_IF_ERROR(SequenceBatchScheduler::Create(
this, enforce_equal_shape_tensors, &scheduler));
} else if (config_.has_dynamic_batching()) {
// Dynamic batcher
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
config_.max_batch_size(), enforce_equal_shape_tensors,
config_.dynamic_batching(),
config_.response_cache().enable() /* response_cache_enable */,
&scheduler));
} else {
// Default scheduler. Use dynamic batch scheduler (with batching
// disabled) as the default scheduler.
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
1 /* max_batch_size */,
std::unordered_map<
std::string, bool>() /* enforce_equal_shape_tensors */,
false /* preserve_ordering */,
config_.response_cache().enable() /* response_cache_enable */,
std::set<int32_t>() /* preferred_batch_sizes */,
0 /* max_queue_delay_microseconds */, &scheduler));
}
return SetScheduler(std::move(scheduler));
}
Status
TritonModel::Initialize()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->Initialize());
}
return Status::Success;
}
Status
TritonModel::WarmUp()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->WarmUp());
}
return Status::Success;
}
TritonModel::TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config)
: Model(
min_compute_capability, localized_model_dir->Path(), version, config),
server_(server), min_compute_capability_(min_compute_capability),
auto_complete_config_(auto_complete_config),
localized_model_dir_(localized_model_dir), backend_(backend),
state_(nullptr)
{
}
TritonModel::~TritonModel()
{
// Explicitly delete/finalize all model instances before finalizing
// the model itself.
instances_.clear();
passive_instances_.clear();
// Unregister the model from the rate limiter. Note this should happen
// after all instances are destructed. Destructing the instances ensures
// there are no instance threads waiting on the rate limiter to
// receive their payloads.
server_->GetRateLimiter()->UnregisterModel(this);
// Model finalization is optional... The TRITONBACKEND_Model
// object is this TritonModel object.
if (backend_->ModelFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
"failed finalizing model");
}
}
extern "C" {
//
// TRITONBACKEND_Model
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*name = tm->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*version = tm->Version();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tm->LocalizedModelPath().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message** model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
std::string model_config_json;
Status status =
ModelConfigToJson(tm->Config(), config_version, &model_config_json);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*model_config = reinterpret_cast<TRITONSERVER_Message*>(
new TritonServerMessage(std::move(model_config_json)));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
TRITONBACKEND_Model* model, bool* auto_complete_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*auto_complete_config = tm->AutoCompleteConfig();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message* model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
Status status = tm->UpdateModelConfig(config_version, model_config);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*state = tm->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
tm->SetState(state);
return nullptr; // success
}
///
/// TRITONBACKEND_Request
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*id = tr->Id().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(TRITONBACKEND_Request* request, uint64_t* id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::UINT64) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not an unsigned int")
.c_str());
}
*id = correlation_id.UnsignedIntValue();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*flags = tr->Flags();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::STRING) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not a string")
.c_str());
}
*id = correlation_id.StringValue().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableInputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** input_name)
{
*input_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
// The request inputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// map. Given that the inputs must be kept in a map and that the
// number of inputs is typically small, this linear search is
// preferable to having every request maintain the inputs as both a
// map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input_name = in->Name().c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
TRITONBACKEND_Request* request, const char* name,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
const auto& itr = inputs.find(name);
if (itr == inputs.end()) {
*input = nullptr;
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "unknown request input name " + name).c_str());
}
InferenceRequest::Input* in = itr->second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
TRITONBACKEND_Request* request, const uint32_t index,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
// The request inputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// map. Given that the inputs must be kept in a map and that the
// number of inputs is typically small, this linear search is
// preferable to having every request maintain the inputs as both a
// map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableRequestedOutputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** output_name)
{
*output_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& routputs = tr->ImmutableRequestedOutputs();
if (index >= routputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(routputs.size()) +
" requested outputs")
.c_str());
}
// The requested outputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// set. Given that the requested outputs are kept in a set and that
// their number is typically small, this linear search should not be
// a performance issue.
uint32_t cnt = 0;
for (const auto& rout : routputs) {
if (cnt++ == index) {
*output_name = rout.c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
auto status =
tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestRelease(
TRITONBACKEND_Request* request, uint32_t release_flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceRequest> ur(tr);
InferenceRequest::Release(std::move(ur), release_flags);
return nullptr; // success
}
///
/// TRITONBACKEND_State
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
SequenceState* ts = reinterpret_cast<SequenceState*>(state);
auto status = ts->Update();
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
SequenceState* lstate;
std::vector<int64_t> lshape(shape, shape + dims_count);
auto& sequence_state = tr->GetSequenceStates();
if (sequence_state == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unable to add state '") + name +
"'. State configuration is missing for model '" + tr->ModelName() +
"'.")
.c_str());
}
Status status = sequence_state->OutputState(
name, TritonToDataType(datatype), lshape, &lstate);
if (!status.IsOk()) {
*state = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*state = reinterpret_cast<TRITONBACKEND_State*>(lstate);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
Status status = Status::Success;
// If the buffer size exactly matches the buffer available, reuse the
// currently allocated buffer.
if (to->Data()->TotalByteSize() == buffer_byte_size) {
const std::shared_ptr<AllocatedMemory>& memory =
reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
TRITONSERVER_MemoryType current_memory_type;
int64_t current_memory_type_id;
void* lbuffer =
memory->MutableBuffer(&current_memory_type, &current_memory_type_id);
// Reuse the currently allocated buffer only if the requested memory type
// and memory type id match; otherwise allocate a new buffer with the
// requested memory type and memory type id.
if (current_memory_type == *memory_type &&
current_memory_type_id == *memory_type_id) {
*buffer = lbuffer;
} else {
std::shared_ptr<AllocatedMemory> memory =
std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
} else {
std::shared_ptr<AllocatedMemory> memory = std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
TRITONBACKEND_State* state,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
to->Data()->BufferAt(
0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
return nullptr; // success
}
//
// TRITONBACKEND_ResponseFactory
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::shared_ptr<InferenceResponseFactory>* response_factory =
new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
*factory = reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
delete response_factory;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
Status status = (*response_factory)->SendFlags(send_flags);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
///
/// TRITONBACKEND_Response
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceResponse> tresp;
Status status = tr->ResponseFactory()->CreateResponse(&tresp);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
std::unique_ptr<InferenceResponse> tr;
Status status = (*response_factory)->CreateResponse(&tr);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
return nullptr; // success
}
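//
// Usage sketch (illustrative only, not part of this implementation): a
// decoupled backend can keep a response factory alive beyond the request and
// emit responses from it later, using only the factory functions defined
// here. Error handling is omitted for brevity.
//
//   TRITONBACKEND_ResponseFactory* factory;
//   TRITONBACKEND_ResponseFactoryNew(&factory, request);
//
//   // ...later, possibly from another thread...
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
//   // ...add outputs to 'response' and send it...
//
//   // Signal that no more responses will be produced, then release the
//   // factory handle.
//   TRITONBACKEND_ResponseFactorySendFlags(
//       factory, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
//   TRITONBACKEND_ResponseFactoryDelete(factory);
//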
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
delete tr;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
TRITONBACKEND_Response* response, const char* name, const char* value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
TRITONBACKEND_Response* response, const char* name, const bool value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
std::vector<int64_t> lshape(shape, shape + dims_count);
InferenceResponse::Output* loutput;
Status status = tr->AddOutput(
name, TritonToDataType(datatype), std::move(lshape), &loutput);
if (!status.IsOk()) {
*output = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status;
std::unique_ptr<InferenceResponse> utr(tr);
if (error == nullptr) {
status = InferenceResponse::Send(std::move(utr), send_flags);
} else {
status = InferenceResponse::SendWithStatus(
std::move(utr), send_flags,
Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
TRITONSERVER_ErrorMessage(error)));
}
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
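//
// Usage sketch (illustrative only, not part of this implementation): the
// typical non-decoupled flow creates one response per request, attaches an
// output tensor, fills the output buffer and sends the response with the
// FINAL flag. The output name, shape and datatype are assumptions for
// illustration; error handling is omitted for brevity.
//
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNew(&response, request);
//
//   const int64_t shape[2] = {1, 16};
//   TRITONBACKEND_Output* output;
//   TRITONBACKEND_ResponseOutput(
//       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
//
//   void* buffer;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_OutputBuffer(
//       output, &buffer, 16 * sizeof(float), &memory_type, &memory_type_id);
//   // ...fill 'buffer', checking the actually returned 'memory_type'...
//
//   TRITONBACKEND_ResponseSend(
//       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* success */);
//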
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
TRITONBACKEND_Input* input, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (host_policy_name != nullptr) {
if (byte_size != nullptr) {
*byte_size = ti->Data(host_policy_name)->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
}
} else {
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
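//
// Usage sketch (illustrative only, not part of this implementation): given a
// TRITONBACKEND_Input* handle obtained from the request (e.g. via
// TRITONBACKEND_RequestInput, assumed here), a backend typically queries the
// input properties and then walks the underlying buffers, which may be split
// across several contiguous chunks.
//
//   const char* name;
//   TRITONSERVER_DataType datatype;
//   const int64_t* shape;
//   uint32_t dims_count;
//   uint64_t byte_size;
//   uint32_t buffer_count;
//   TRITONBACKEND_InputProperties(
//       input, &name, &datatype, &shape, &dims_count, &byte_size,
//       &buffer_count);
//
//   for (uint32_t b = 0; b < buffer_count; ++b) {
//     const void* buffer;
//     uint64_t buffer_byte_size;
//     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//     int64_t memory_type_id = 0;
//     TRITONBACKEND_InputBuffer(
//         input, b, &buffer, &buffer_byte_size, &memory_type,
//         &memory_type_id);
//     // ...consume 'buffer_byte_size' bytes at 'buffer', honoring the
//     // returned 'memory_type' and 'memory_type_id'...
//   }
//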
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBufferAttributes(
index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_attributes = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name,
const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status =
(host_policy_name == nullptr)
? ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id)
: ti->DataBufferForHostPolicy(
index, buffer, buffer_byte_size, memory_type, memory_type_id,
host_policy_name);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
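//
// Usage sketch (illustrative only, not part of this implementation): the
// host-policy variant behaves exactly like TRITONBACKEND_InputBuffer when the
// policy name is nullptr, so a backend can pass an optional policy name
// straight through.
//
//   const void* buffer;
//   uint64_t buffer_byte_size;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_InputBufferForHostPolicy(
//       input, host_policy_name /* may be nullptr */, 0 /* index */, &buffer,
//       &buffer_byte_size, &memory_type, &memory_type_id);
//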
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
TRITONBACKEND_Output* output, void** buffer,
const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
Status status = to->AllocateDataBuffer(
buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
TRITONBACKEND_Output* output,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
*buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
to->GetBufferAttributes());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
TRITONBACKEND_BackendAttribute* backend_attributes,
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count)
{
auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
ba->preferred_groups_.emplace_back();
auto& pg = ba->preferred_groups_.back();
switch (kind) {
case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
break;
case TRITONSERVER_INSTANCEGROUPKIND_CPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_GPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
break;
}
pg.set_count(count);
if (device_ids != nullptr) {
for (size_t i = 0; i < id_count; ++i) {
pg.add_gpus(device_ids[i]);
}
}
  return nullptr;  // success
}
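//
// Usage sketch (illustrative only, not part of this implementation): a
// backend can advertise a preferred instance group from its
// TRITONBACKEND_GetBackendAttribute entry point. The entry-point signature
// below is an assumption based on tritonbackend.h.
//
//   TRITONSERVER_Error*
//   TRITONBACKEND_GetBackendAttribute(
//       TRITONBACKEND_Backend* backend,
//       TRITONBACKEND_BackendAttribute* backend_attributes)
//   {
//     // Prefer a single GPU instance on device 0.
//     const uint64_t device_ids[1] = {0};
//     return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
//         backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU,
//         1 /* count */, device_ids, 1 /* id_count */);
//   }
//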
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {
class InferenceServer;
class TritonModelInstance;
//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
public:
static Status Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model);
~TritonModel();
const std::string& LocalizedModelPath() const
{
return localized_model_dir_->Path();
}
InferenceServer* Server() { return server_; }
bool AutoCompleteConfig() const { return auto_complete_config_; }
Status UpdateModelConfig(
const uint32_t config_version,
TRITONSERVER_Message* updated_config_message);
const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }
const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
{
return instances_;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
Status AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive);
private:
DISALLOW_COPY_AND_ASSIGN(TritonModel);
TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config);
// Set the scheduler based on the model configuration. The scheduler
// can only be set once for a backend.
Status SetConfiguredScheduler();
// Merges the global backend configs with the specific
// backend configs.
static Status ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config);
// Sets defaults for some backend configurations when none are specified on
// the command line.
static Status SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config);
Status Initialize();
Status WarmUp();
// The server object that owns this model. The model holds this as a
// raw pointer because the lifetime of the server is guaranteed to
// be longer than the lifetime of a model owned by the server.
InferenceServer* server_;
// The minimum supported compute capability on device.
const double min_compute_capability_;
// Whether the backend should attempt to auto-complete the model config.
const bool auto_complete_config_;
// The localized repo directory holding the model. If localization
// required creation of a temporary local copy then that copy will
  // persist as long as this object is retained by this model.
std::shared_ptr<LocalizedPath> localized_model_dir_;
// Backend used by this model.
std::shared_ptr<TritonBackend> backend_;
// The model instances for this model.
std::vector<std::unique_ptr<TritonModelInstance>> instances_;
std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;
// Opaque state associated with this model.
void* state_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
namespace {
// Utilities for warmup feature
TRITONSERVER_Error*
WarmupResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
*buffer = malloc(byte_size);
if (*buffer != nullptr) {
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
return nullptr;
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"failed to allocate output buffer for warmup.");
}
TRITONSERVER_Error*
WarmupResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
free(buffer);
return nullptr;
}
ResponseAllocator warmup_allocator = ResponseAllocator(
WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);
void
WarmupResponseComplete(
TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
void* userp)
{
auto res_pair = reinterpret_cast<
std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
if (iresponse != nullptr) {
auto err = TRITONSERVER_InferenceResponseError(iresponse);
if (err != nullptr) {
// The error vector is shared by all requests in the batch for now
static std::mutex res_mtx;
{
std::lock_guard<std::mutex> lk(res_mtx);
res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
}
TRITONSERVER_ErrorDelete(err);
}
    // Just delete the response; warmup doesn't check for correctness.
LOG_TRITONSERVER_ERROR(
TRITONSERVER_InferenceResponseDelete(iresponse),
"deleting warmup response");
}
// Last response
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
res_pair->first.set_value();
}
}
void
WarmupRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
    // No need to release the request here; it is managed in WarmupData.
if (userp != nullptr) {
auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
warmup_promise->set_value();
}
}
}
} // namespace
TritonModelInstance::TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices)
: model_(model), name_(name), index_(index), kind_(kind),
device_id_(device_id), host_policy_(host_policy),
host_policy_message_(host_policy_message), profile_names_(profile_names),
passive_(passive), secondary_devices_(secondary_devices), state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
if (Metrics::Enabled()) {
// Use an ID in the metric only for GPU instances. Otherwise use
// METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
// metric.
const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
? device_id_
: METRIC_REPORTER_ID_CPU;
MetricModelReporter::Create(
model_->Name(), model_->Version(), id, model_->Config().metric_tags(),
&reporter_);
}
#endif // TRITON_ENABLE_METRICS
}
TritonModelInstance::~TritonModelInstance()
{
if (triton_backend_thread_.get() != nullptr) {
triton_backend_thread_->StopBackendThread();
}
// Model finalization is optional...
if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
model_->Backend()->ModelInstanceFiniFn()(
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
"failed finalizing model instance");
}
}
Status
TritonModelInstance::CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking)
{
static triton::common::HostPolicyCmdlineConfig empty_host_policy;
  // This map is used to share a TritonBackendThread among instances on the
  // same device when the device-blocking execution policy is used.
std::map<uint32_t, std::shared_ptr<TritonBackendThread>> device_to_thread_map;
for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
for (const auto& profile_name : group.profile()) {
profile_names.push_back(profile_name);
}
std::vector<SecondaryDevice> secondary_devices;
for (const auto& secondary_device : group.secondary_devices()) {
secondary_devices.emplace_back(
inference::
ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
secondary_device.kind()),
secondary_device.device_id());
}
for (int32_t c = 0; c < group.count(); ++c) {
std::string instance_name{group.count() > 1
? group.name() + "_" + std::to_string(c)
: group.name()};
const bool passive = group.passive();
std::vector<std::tuple<
std::string, TRITONSERVER_InstanceGroupKind, int32_t,
const inference::ModelRateLimiter*>>
instance_setting;
if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
instance_setting.emplace_back(
group.host_policy().empty() ? "cpu" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
&group.rate_limiter());
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
for (const int32_t device_id : group.gpus()) {
instance_setting.emplace_back(
group.host_policy().empty() ? ("gpu_" + std::to_string(device_id))
: group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
&group.rate_limiter());
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
instance_setting.emplace_back(
group.host_policy().empty() ? "model" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
&group.rate_limiter());
} else {
return Status(
Status::Code::INVALID_ARG,
std::string("instance_group kind ") +
ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
}
for (const auto is : instance_setting) {
const auto& kind = std::get<1>(is);
const auto& id = std::get<2>(is);
const std::string& policy_name = std::get<0>(is);
const triton::common::HostPolicyCmdlineConfig* host_policy;
const auto policy_it = host_policy_map.find(policy_name);
if (policy_it != host_policy_map.end()) {
host_policy = &policy_it->second;
} else {
host_policy = &empty_host_policy;
}
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = CreateInstance(
model, instance_name, c, kind, id, profile_names, passive,
policy_name, *host_policy, *(std::get<3>(is)), device_blocking,
&device_to_thread_map, secondary_devices);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
        // When deploying on GPU, make sure the GPU memory usage is within the
        // allowed range; otherwise stop instance creation to ensure there is
        // sufficient GPU memory left for other use. The usage is checked
        // after loading the instance to better enforce the limit: if checked
        // before loading, we may create an instance that occupies the rest of
        // the available memory, which defeats the purpose of the check.
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}
}
}
}
}
return Status::Success;
}
Status
TritonModelInstance::CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value host_policy_json(
triton::common::TritonJson::ValueType::OBJECT);
triton::common::TritonJson::Value policy_setting_json(
host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : host_policy) {
RETURN_IF_ERROR(policy_setting_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(host_policy_json.Add(
host_policy_name.c_str(), std::move(policy_setting_json)));
TritonServerMessage host_policy_message(host_policy_json);
std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
model, name, index, kind, device_id, profile_names, passive, host_policy,
host_policy_message, secondary_devices));
TRITONBACKEND_ModelInstance* triton_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());
  // Instance initialization is optional... We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
if (model->Backend()->ModelInstanceInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));
TRITONSERVER_Error* err =
model->Backend()->ModelInstanceInitFn()(triton_instance);
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
if (!passive) {
RETURN_IF_ERROR(local_instance->GenerateWarmupData());
RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
local_instance.get(), rate_limiter_config));
RETURN_IF_ERROR(local_instance->SetBackendThread(
kind, device_id, device_blocking, device_to_thread_map));
}
RETURN_IF_ERROR(model->AddInstance(std::move(local_instance), passive));
return Status::Success;
}
Status
TritonModelInstance::SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map)
{
if (device_blocking && (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU)) {
auto thread_it = device_to_thread_map->find(device_id);
if (thread_it != device_to_thread_map->end()) {
LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
<< " on device " << device_id;
triton_backend_thread_ = thread_it->second;
}
}
if (triton_backend_thread_.get() == nullptr) {
std::unique_ptr<TritonBackendThread> local_backend_thread;
RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
Name(), this, 0 /* nice */, device_id, &local_backend_thread));
triton_backend_thread_ = std::move(local_backend_thread);
device_to_thread_map->insert({device_id, triton_backend_thread_});
} else {
triton_backend_thread_->AddModelInstance(this);
}
RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));
return Status::Success;
}
Status
TritonModelInstance::GenerateWarmupData()
{
warmup_samples_.clear();
for (const auto& warmup_setting : model_->Config().model_warmup()) {
if (warmup_setting.batch_size() == 0) {
LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
<< warmup_setting.name() << "'";
continue;
}
LOG_VERBOSE(1) << "Generating warmup sample data for '"
<< warmup_setting.name() << "'";
    // Two passes: the first pass gets the max byte size for the synthetic
    // data; the second pass adds the original inputs and the override inputs
    // for control inputs.
int64_t max_zero_byte_size = 0;
int64_t max_random_byte_size = 0;
for (const auto& input_meta : warmup_setting.inputs()) {
auto element_count =
triton::common::GetElementCount(input_meta.second.dims());
if (element_count == -1) {
return Status(
Status::Code::INVALID_ARG,
"warmup setting expects all variable-size dimensions are specified "
"for input '" +
input_meta.first + "'");
}
int64_t batch_byte_size =
element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = element_count * sizeof(int32_t);
}
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
          // Triton expects the STRING type to be in a special format (4 bytes
          // prepended to specify the string length), so zero data is used for
          // simplicity (4 bytes * element count of zeros).
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
} else {
max_random_byte_size =
std::max(batch_byte_size, max_random_byte_size);
}
break;
}
default:
break;
}
}
warmup_samples_.emplace_back(warmup_setting.name(), warmup_setting.count());
auto& warmup_data = warmup_samples_.back();
// Create buffers for synthetic data
TRITONSERVER_MemoryType type;
int64_t type_id;
warmup_data.zero_data_.reset(new AllocatedMemory(
max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
memset(zero_buffer, 0, max_zero_byte_size);
warmup_data.random_data_.reset(new AllocatedMemory(
max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* random_buffer =
warmup_data.random_data_->MutableBuffer(&type, &type_id);
for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
random_buffer[offset] = rand();
}
    // Prepare the inference request for the specified sample. The in-process
    // C API is not used because the request doesn't go through the same
    // pipeline (i.e. no normalization / scheduler), so the request must be
    // prepared to the state it would be in just before the instance execute
    // function is called.
for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
warmup_data.requests_.emplace_back(
new InferenceRequest(model_, model_->Version()));
auto& lrequest = warmup_data.requests_.back();
// Second pass to prepare original inputs.
std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
for (const auto& input_meta : warmup_setting.inputs()) {
auto batch1_element_count =
triton::common::GetElementCount(input_meta.second.dims());
auto batch_byte_size =
batch1_element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = batch1_element_count * sizeof(int32_t);
}
const char* allocated_ptr;
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
allocated_ptr = zero_buffer;
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
allocated_ptr = zero_buffer;
} else {
allocated_ptr = random_buffer;
}
break;
}
case inference::ModelWarmup_Input::InputDataTypeCase::
kInputDataFile: {
// For data provided from file, we can set buffer in first pass
warmup_data.provided_data_.emplace_back(new std::string());
auto input_data = warmup_data.provided_data_.back().get();
RETURN_IF_ERROR(ReadTextFile(
JoinPath({model_->LocalizedModelPath(), kWarmupDataFolder,
input_meta.second.input_data_file()}),
input_data));
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
batch_byte_size = input_data->size();
} else if (((size_t)batch_byte_size) > input_data->size()) {
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects " +
std::to_string(batch_byte_size) +
" bytes, but the data "
"provided from " +
input_meta.second.input_data_file() + "only has " +
std::to_string(input_data->size()) + " bytes");
}
allocated_ptr = input_data->data();
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects input '" +
input_meta.first + "' to have input_data_type set");
}
const inference::ModelInput* input_config;
bool is_original_input =
model_->GetInput(input_meta.first, &input_config).IsOk();
InferenceRequest::Input* input = nullptr;
std::vector<int64_t> input_meta_shape;
        // Append the batch dimension only if the model supports batching
        // and the input is not a control input.
if ((model_->Config().max_batch_size() != 0) && is_original_input) {
input_meta_shape.push_back(1);
}
for (auto d : input_meta.second.dims()) {
input_meta_shape.push_back(d);
}
if (is_original_input) {
RETURN_IF_ERROR(lrequest->AddOriginalInput(
input_meta.first, input_meta.second.data_type(), input_meta_shape,
&input));
} else {
input_sps.emplace_back();
RETURN_IF_ERROR(lrequest->AddOverrideInput(
input_meta.first, input_meta.second.data_type(),
(model_->Config().max_batch_size() != 0 ? 1 : 0),
input_meta_shape, &input_sps.back()));
input = input_sps.back().get();
}
RETURN_IF_ERROR(input->AppendData(
allocated_ptr, batch_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */));
}
RETURN_IF_ERROR(lrequest->PrepareForInference());
// Override inputs must be added after PrepareForInference() is called
for (const auto& sp : input_sps) {
RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
}
}
}
return Status::Success;
}
void
TritonModelInstance::Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion)
{
// Use a thread local vector to avoid needing to malloc each
// time an inference is run.
thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
triton_requests.clear();
for (auto& r : requests) {
// Load the input states for the inference request.
r->LoadInputStates();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
}
Execute(triton_requests);
OnCompletion();
}
Status
TritonModelInstance::Initialize()
{
RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
return Status::Success;
}
Status
TritonModelInstance::WarmUp()
{
// move samples to local variable for scoped cleanup
std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
lwarmup_samples.swap(warmup_samples_);
for (auto& sample : lwarmup_samples) {
for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name() << " is running warmup sample '"
<< sample.sample_name_ << "' for iteration " << iteration;
      // Request/response completion is asynchronous, so promises are used to
      // wait for completion. Error messages from the responses are also
      // collected in a vector.
std::vector<std::promise<void>> request_complete(sample.requests_.size());
std::vector<std::string> response_errors;
std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
response_complete(sample.requests_.size());
std::vector<TRITONBACKEND_Request*> triton_requests;
for (size_t i = 0; i < sample.requests_.size(); ++i) {
auto& request = sample.requests_[i];
request->SetReleaseCallback(
WarmupRequestComplete, &request_complete[i]);
response_complete[i].second = &response_errors;
request->SetResponseCallback(
&warmup_allocator, nullptr, WarmupResponseComplete,
&response_complete[i]);
// Capture timestamp before run to avoid incorrect accumulation from
// sequential warmup runs
#ifdef TRITON_ENABLE_STATS
request->CaptureRequestStartNs();
#endif // TRITON_ENABLE_STATS
request->CaptureQueueStartNs();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
}
Execute(triton_requests);
// Wait for warmup sample to complete and check error
for (size_t i = 0; i < sample.requests_.size(); ++i) {
request_complete[i].get_future().get();
response_complete[i].first.get_future().get();
}
if (response_errors.size() != 0) {
std::string err_str =
"failed to run warmup sample '" + sample.sample_name_ + "': ";
for (const auto& error : response_errors) {
err_str += (error + "; ");
}
        // End warmup as soon as there is a failing sample.
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name()
<< " failed to run warmup sample '"
<< sample.sample_name_ << "'";
return Status(Status::Code::INVALID_ARG, err_str);
}
}
}
return Status::Success;
}
void
TritonModelInstance::Execute(
std::vector<TRITONBACKEND_Request*>& triton_requests)
{
TRITONBACKEND_ModelInstance* triton_model_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
model_->Backend()->ModelInstanceExecFn();
// If there is an error then we retain ownership of 'requests'
// and must send error responses.
TRITONSERVER_Error* err = inst_exec_fn(
triton_model_instance, &triton_requests[0], triton_requests.size());
if (err != nullptr) {
Status status = Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
TRITONSERVER_ErrorMessage(err));
for (TRITONBACKEND_Request* tr : triton_requests) {
std::unique_ptr<InferenceRequest> ur(
reinterpret_cast<InferenceRequest*>(tr));
InferenceRequest::RespondIfError(ur, status, true /* release_requests */);
}
TRITONSERVER_ErrorDelete(err);
}
}
Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
const std::string name, TritonModelInstance* model_instance, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
TritonBackendThread* raw_triton_backend_thread =
new TritonBackendThread(name, model_instance->Model());
std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
runner->AddModelInstance(model_instance);
runner->backend_thread_ =
std::thread([raw_triton_backend_thread, nice, device_id]() {
raw_triton_backend_thread->BackendThread(nice, device_id);
});
triton_backend_thread->reset(runner.release());
return Status::Success;
}
void
TritonModelInstance::TritonBackendThread::AddModelInstance(
TritonModelInstance* model_instance)
{
model_instances_.push_back(model_instance);
}
Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
TritonModelInstance* model_instance)
{
// Initialize the instance on the backend thread
auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INIT, model_instance);
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, init_payload));
RETURN_IF_ERROR(init_payload->Wait());
// Warm-up the instance on the backend thread
auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::WARM_UP, model_instance);
RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
model_, warmup_payload));
RETURN_IF_ERROR(warmup_payload->Wait());
return Status::Success;
}
TritonModelInstance::TritonBackendThread::TritonBackendThread(
const std::string& name, TritonModel* model)
: name_(name), model_(model)
{
}
TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
StopBackendThread();
}
void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
if (backend_thread_.joinable()) {
// Signal the backend thread to exit and then wait for it...
auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::EXIT, model_instances_.back());
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
backend_thread_.join();
}
}
void
TritonModelInstance::TritonBackendThread::BackendThread(
const int nice, const int32_t device_id)
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
<< nice << " on device " << device_id << "...";
} else {
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice (requested nice " << nice << " failed)"
<< " on device " << device_id << "...";
}
#else
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice on device " << device_id << "...";
#endif
bool should_exit = false;
while (!should_exit) {
std::shared_ptr<Payload> payload;
model_->Server()->GetRateLimiter()->DequeuePayload(
model_instances_, &payload);
NVTX_RANGE(nvtx_, "BackendThread " + name_);
payload->Execute(&should_exit);
model_instances_.push_back(payload->GetInstance());
// Release the payload to the RateLimiter
model_->Server()->GetRateLimiter()->PayloadRelease(payload);
}
LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
TRITONBACKEND_ModelInstance* instance, const char** name)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*name = ti->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_InstanceGroupKind* kind)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*kind = ti->Kind();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*device_id = ti->DeviceId();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*host_policy = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&ti->HostPolicyMessage()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->Profiles().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
TRITONBACKEND_ModelInstance* instance, const uint32_t index,
const char** profile_name)
{
*profile_name = nullptr;
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rprofiles = ti->Profiles();
if (index >= rprofiles.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " + std::to_string(rprofiles.size()) +
" profiles")
.c_str());
}
*profile_name = rprofiles[index].c_str();
return nullptr; // success
}
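//
// Usage sketch (illustrative only, not part of this implementation): the
// count/name pair is typically used together to enumerate the optimization
// profiles configured for an instance.
//
//   uint32_t profile_count;
//   TRITONBACKEND_ModelInstanceProfileCount(instance, &profile_count);
//   for (uint32_t i = 0; i < profile_count; ++i) {
//     const char* profile_name;
//     TRITONBACKEND_ModelInstanceProfileName(instance, i, &profile_name);
//     // ...select or validate the profile named 'profile_name'...
//   }
//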
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->SecondaryDevices().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
int64_t* id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rsecondarydevices = ti->SecondaryDevices();
if (index >= rsecondarydevices.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " +
std::to_string(rsecondarydevices.size()) + " secondary devices")
.c_str());
}
*kind = rsecondarydevices[index].kind_.c_str();
*id = rsecondarydevices[index].id_;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*is_passive = ti->IsPassive();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
TRITONBACKEND_ModelInstance* instance, void** state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*state = ti->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
TRITONBACKEND_ModelInstance* instance, void* state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->SetState(state);
return nullptr; // success
}
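//
// Usage sketch (illustrative only, not part of this implementation): backends
// commonly attach a per-instance object in their instance-initialize entry
// point and retrieve it again during execute and finalize. 'InstanceState'
// is a hypothetical backend-defined type.
//
//   // In TRITONBACKEND_ModelInstanceInitialize:
//   InstanceState* istate = new InstanceState();
//   TRITONBACKEND_ModelInstanceSetState(instance, istate);
//
//   // In TRITONBACKEND_ModelInstanceExecute (and deleted again in
//   // TRITONBACKEND_ModelInstanceFinalize):
//   void* vstate;
//   TRITONBACKEND_ModelInstanceState(instance, &vstate);
//   InstanceState* istate = reinterpret_cast<InstanceState*>(vstate);
//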
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
const bool success, const uint64_t exec_start_ns,
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
tr->ReportStatistics(
ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
const uint64_t exec_start_ns, const uint64_t compute_start_ns,
const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
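//
// Usage sketch (illustrative only, not part of this implementation): a
// backend's execute function usually captures wall-clock timestamps around
// the compute phase, reports them per request, and then reports one
// batch-level record per execution. Capturing timestamps with std::chrono is
// an assumption; real backends may use a helper from the backend utilities
// instead.
//
//   auto now_ns = []() {
//     return static_cast<uint64_t>(
//         std::chrono::duration_cast<std::chrono::nanoseconds>(
//             std::chrono::steady_clock::now().time_since_epoch())
//             .count());
//   };
//   const uint64_t exec_start_ns = now_ns();
//   const uint64_t compute_start_ns = now_ns();
//   // ...run the model and produce responses...
//   const uint64_t compute_end_ns = now_ns();
//   const uint64_t exec_end_ns = now_ns();
//
//   TRITONBACKEND_ModelInstanceReportStatistics(
//       instance, request, true /* success */, exec_start_ns,
//       compute_start_ns, compute_end_ns, exec_end_ns);
//   TRITONBACKEND_ModelInstanceReportBatchStatistics(
//       instance, request_count /* as passed to execute */, exec_start_ns,
//       compute_start_ns, compute_end_ns, exec_end_ns);
//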
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <future>
#include <map>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {
class TritonModel;
class InferenceRequest;
//
// Represents a model instance.
//
class TritonModelInstance {
public:
static Status CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking);
~TritonModelInstance();
const std::string& Name() const { return name_; }
size_t Index() const { return index_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
{
return host_policy_;
}
const TritonServerMessage& HostPolicyMessage() const
{
return host_policy_message_;
}
bool IsPassive() const { return passive_; }
const std::vector<std::string>& Profiles() const { return profile_names_; }
struct SecondaryDevice {
SecondaryDevice(const std::string kind, const int64_t id)
: kind_(kind), id_(id)
{
}
const std::string kind_;
const int64_t id_;
};
const std::vector<SecondaryDevice>& SecondaryDevices() const
{
return secondary_devices_;
}
Status Initialize();
Status WarmUp();
void Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion);
TritonModel* Model() const { return model_; }
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
MetricModelReporter* MetricReporter() const { return reporter_.get(); }
private:
DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);
class TritonBackendThread;
TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices);
static Status CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices);
Status SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map);
Status GenerateWarmupData();
void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);
class TritonBackendThread {
public:
static Status CreateBackendThread(
const std::string name, TritonModelInstance* model, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread);
void AddModelInstance(TritonModelInstance* model_instance);
Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
void StopBackendThread();
~TritonBackendThread();
private:
TritonBackendThread(const std::string& name, TritonModel* model);
void BackendThread(const int nice, const int32_t device_id);
std::string name_;
TritonModel* model_;
std::deque<TritonModelInstance*> model_instances_;
std::thread backend_thread_;
std::atomic<bool> backend_thread_exit_;
};
std::shared_ptr<TritonBackendThread> triton_backend_thread_;
struct WarmupData {
WarmupData(const std::string& sample_name, const size_t count)
: sample_name_(sample_name), count_(std::max(count, size_t{1}))
{
}
std::string sample_name_;
size_t count_;
    // A batch of requests is used to satisfy the batch size; this provides
    // better alignment with the batch expected by the model, especially for
    // sequence models.
std::vector<std::unique_ptr<InferenceRequest>> requests_;
// Placeholder for input data
std::unique_ptr<AllocatedMemory> zero_data_;
std::unique_ptr<AllocatedMemory> random_data_;
std::vector<std::unique_ptr<std::string>> provided_data_;
};
std::vector<WarmupData> warmup_samples_;
// The TritonModel object that owns this instance. The instance
// holds this as a raw pointer because the lifetime of the model is
// guaranteed to be longer than the lifetime of an instance owned by the
// model.
TritonModel* model_;
std::string name_;
size_t index_;
  // For CPU instances device_id_ is always 0. For GPU instances device_id_
  // indicates the GPU device to be used by the instance.
TRITONSERVER_InstanceGroupKind kind_;
int32_t device_id_;
const triton::common::HostPolicyCmdlineConfig host_policy_;
TritonServerMessage host_policy_message_;
std::vector<std::string> profile_names_;
bool passive_;
std::vector<SecondaryDevice> secondary_devices_;
// Reporter for metrics, or nullptr if no metrics should be reported
std::shared_ptr<MetricModelReporter> reporter_;
// Opaque state associated with this model instance.
void* state_;
};
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {
void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
byte_size_ = byte_size;
}
void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
memory_type_ = memory_type;
}
void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
memory_type_id_ = memory_type_id;
}
void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
cuda_ipc_handle_.clear();
std::copy(
lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
void*
BufferAttributes::CudaIpcHandle()
{
if (cuda_ipc_handle_.empty()) {
return nullptr;
} else {
return reinterpret_cast<void*>(cuda_ipc_handle_.data());
}
}
size_t
BufferAttributes::ByteSize() const
{
return byte_size_;
}
TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
return memory_type_;
}
int64_t
BufferAttributes::MemoryTypeId() const
{
return memory_type_id_;
}
BufferAttributes::BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char* cuda_ipc_handle)
: byte_size_(byte_size), memory_type_(memory_type),
memory_type_id_(memory_type_id)
{
// cuda ipc handle size
cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);
if (cuda_ipc_handle != nullptr) {
std::copy(
cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
}
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
public:
BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char cuda_ipc_handle[64]);
BufferAttributes()
{
memory_type_ = TRITONSERVER_MEMORY_CPU;
memory_type_id_ = 0;
cuda_ipc_handle_.reserve(64);
}
// Set the buffer byte size
void SetByteSize(const size_t& byte_size);
// Set the buffer memory_type
void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);
// Set the buffer memory type id
void SetMemoryTypeId(const int64_t& memory_type_id);
// Set the cuda ipc handle
void SetCudaIpcHandle(void* cuda_ipc_handle);
// Get the cuda ipc handle
void* CudaIpcHandle();
// Get the byte size
size_t ByteSize() const;
// Get the memory type
TRITONSERVER_MemoryType MemoryType() const;
// Get the memory type id
int64_t MemoryTypeId() const;
private:
size_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t memory_type_id_;
std::vector<char> cuda_ipc_handle_;
};
}} // namespace triton::core
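// Illustrative only, not part of the original sources: a minimal sketch of how
// the BufferAttributes class above might be populated and queried. It assumes
// this translation unit is built inside triton core so that
// "buffer_attributes.h" and the TRITONSERVER_* memory-type enum are available;
// the function name and the values used are hypothetical.
#include "buffer_attributes.h"

static void
ExampleBufferAttributesUsage()
{
  triton::core::BufferAttributes attrs;
  attrs.SetByteSize(1024);
  attrs.SetMemoryType(TRITONSERVER_MEMORY_CPU);
  attrs.SetMemoryTypeId(0);
  // CudaIpcHandle() returns nullptr until SetCudaIpcHandle() has copied a
  // CUDA_IPC_STRUCT_SIZE-byte handle into the attributes.
  void* ipc_handle = attrs.CudaIpcHandle();
  (void)ipc_handle;
  (void)attrs.ByteSize();
}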
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {
constexpr char kInferHeaderContentLengthHTTPHeader[] =
"Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";
constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";
constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";
constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";
constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";
constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";
constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";
#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif // TRITON_ENABLE_ENSEMBLE
constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
"auto_mixed_precision";
constexpr char kModelConfigPbTxt[] = "config.pbtxt";
constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";
constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";
constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;
#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif
#define TIMESPEC_TO_NANOS(TS) \
((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
(TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)
#define DISALLOW_MOVE(TypeName) TypeName(TypeName&&) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
DISALLOW_COPY(TypeName) \
DISALLOW_ASSIGN(TypeName)
}} // namespace triton::core
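// Illustrative only, not part of the original sources: a minimal sketch showing
// how the TIMESPEC_TO_NANOS/TIMESPEC_TO_MILLIS macros above convert a POSIX
// timespec. For example, {tv_sec = 2, tv_nsec = 500000000} yields 2500000000 ns
// and 2500 ms. A POSIX struct timespec is assumed and the function name is
// hypothetical.
#include <time.h>

#include "constants.h"

static uint64_t
ExampleElapsedMillis(const struct timespec& start, const struct timespec& end)
{
  // TIMESPEC_TO_NANOS(ts) expands to ts.tv_sec * NANOS_PER_SECOND + ts.tv_nsec.
  return (TIMESPEC_TO_NANOS(end) - TIMESPEC_TO_NANOS(start)) /
         triton::core::NANOS_PER_MILLIS;
}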
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace {
#define RETURN_IF_CNMEM_ERROR(S, MSG) \
do { \
auto status__ = (S); \
if (status__ != CNMEM_STATUS_SUCCESS) { \
return Status( \
Status::Code::INTERNAL, \
(MSG) + ": " + cnmemGetErrorString(status__)); \
} \
} while (false)
std::string
PointerToString(void* ptr)
{
std::stringstream ss;
ss << ptr;
return ss.str();
}
} // namespace
namespace triton { namespace core {
std::unique_ptr<CudaMemoryManager> CudaMemoryManager::instance_;
std::mutex CudaMemoryManager::instance_mu_;
CudaMemoryManager::~CudaMemoryManager()
{
if (has_allocation_) {
auto status = cnmemFinalize();
if (status != CNMEM_STATUS_SUCCESS) {
LOG_ERROR << "Failed to finalize CUDA memory manager: [" << status << "] "
<< cnmemGetErrorString(status);
}
}
}
void
CudaMemoryManager::Reset()
{
std::lock_guard<std::mutex> lock(instance_mu_);
instance_.reset();
}
Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
// Ensure thread-safe creation of CUDA memory pool
std::lock_guard<std::mutex> lock(instance_mu_);
if (instance_ != nullptr) {
LOG_WARNING << "New CUDA memory pools could not be created since they "
"already exists";
return Status::Success;
}
std::set<int> supported_gpus;
auto status = GetSupportedGPUs(
&supported_gpus, options.min_supported_compute_capability_);
if (status.IsOk()) {
std::vector<cnmemDevice_t> devices;
for (auto gpu : supported_gpus) {
const auto it = options.memory_pool_byte_size_.find(gpu);
if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
devices.emplace_back();
auto& device = devices.back();
memset(&device, 0, sizeof(device));
device.device = gpu;
device.size = it->second;
LOG_INFO << "CUDA memory pool is created on device " << device.device
<< " with size " << device.size;
}
}
if (!devices.empty()) {
RETURN_IF_CNMEM_ERROR(
cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
std::string("Failed to finalize CUDA memory manager"));
} else {
LOG_INFO << "CUDA memory pool disabled";
}
// Keep the instance so that CNMeM is finalized properly when it goes out of scope
instance_.reset(new CudaMemoryManager(!devices.empty()));
} else {
return Status(
status.ErrorCode(),
"Failed to initialize CUDA memory manager: " + status.Message());
}
return Status::Success;
}
Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemMalloc(ptr, size, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to allocate CUDA memory with byte size ") +
std::to_string(size) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemFree(ptr, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to deallocate CUDA memory at address ") +
PointerToString(ptr) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {
// This is a singleton class responsible for maintaining the CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via the functions provided by this class.
class CudaMemoryManager {
public:
// Options to configure CUDA memory manager.
struct Options {
Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
: min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
{
}
// The minimum compute capability of the supported devices.
double min_supported_compute_capability_;
// The size of CUDA memory reserved for the specified devices.
// The memory size will be rounded up to align with
// the default granularity (512 bytes).
// No memory will be reserved for devices that are not listed.
std::map<int, uint64_t> memory_pool_byte_size_;
};
~CudaMemoryManager();
// Create the memory manager based on 'options' specified.
// Return Status object indicating success or failure.
static Status Create(const Options& options);
// Allocate CUDA memory on GPU 'device_id' with
// the requested 'size' and return the pointer in 'ptr'.
// Return Status object indicating success or failure.
static Status Alloc(void** ptr, uint64_t size, int64_t device_id);
// Free the memory allocated by the memory manager on 'device_id'.
// Return Status object indicating success or failure.
static Status Free(void* ptr, int64_t device_id);
protected:
// Provide explicit control on the lifecycle of the CUDA memory manager,
// for testing only.
static void Reset();
private:
CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}
bool has_allocation_;
static std::unique_ptr<CudaMemoryManager> instance_;
static std::mutex instance_mu_;
};
}} // namespace triton::core
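// Illustrative only, not part of the original sources: a minimal sketch of the
// CudaMemoryManager API declared above. It assumes a TRITON_ENABLE_GPU build
// with at least one GPU of sufficient compute capability; the pool size, the
// byte counts, and the function name are hypothetical.
#include "cuda_memory_manager.h"

static triton::core::Status
ExampleCudaPoolUsage()
{
  using triton::core::CudaMemoryManager;
  using triton::core::Status;
  // Reserve a 64 MB pool on GPU 0; devices that are not listed get no pool.
  CudaMemoryManager::Options options(
      6.0 /* min compute capability */, {{0, 64 * 1024 * 1024}});
  Status status = CudaMemoryManager::Create(options);
  if (!status.IsOk()) {
    return status;
  }
  void* ptr = nullptr;
  status = CudaMemoryManager::Alloc(&ptr, 1024 /* byte size */, 0 /* GPU 0 */);
  if (!status.IsOk()) {
    return status;
  }
  return CudaMemoryManager::Free(ptr, 0 /* GPU 0 */);
}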
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {
#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
auto* copy_params = reinterpret_cast<CopyParams*>(args);
memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
delete copy_params;
}
#endif // TRITON_ENABLE_GPU
Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
*free = 0;
*total = 0;
#ifdef TRITON_ENABLE_GPU
// Make sure that the correct device is set before querying the memory info
// and then restore the device to what was set by the caller.
int current_device;
auto cuerr = cudaGetDevice(&current_device);
bool overridden = false;
if (cuerr == cudaSuccess) {
overridden = (current_device != device_id);
if (overridden) {
cuerr = cudaSetDevice(device_id);
}
}
if (cuerr == cudaSuccess) {
cuerr = cudaMemGetInfo(free, total);
}
if (overridden) {
cudaSetDevice(current_device);
}
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
(std::string("unable to get memory info for device ") +
std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
// If we can't enable peer access for one device pair, the best we can
// do is to skip it...
std::set<int> supported_gpus;
bool all_enabled = false;
if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
all_enabled = true;
int can_access_peer = false;
for (const auto& host : supported_gpus) {
auto cuerr = cudaSetDevice(host);
if (cuerr == cudaSuccess) {
for (const auto& peer : supported_gpus) {
if (host == peer) {
continue;
}
cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
cuerr = cudaDeviceEnablePeerAccess(peer, 0);
}
all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
}
}
}
}
if (!all_enabled) {
return Status(
Status::Code::UNSUPPORTED,
"failed to enable peer access for some device pairs");
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
NVTX_RANGE(nvtx_, "CopyBuffer");
*cuda_used = false;
// For CUDA memcpy, all host-to-host copies are blocking with respect to the
// host, so use memcpy() directly. In this case, be careful about whether the
// src buffer is valid.
if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
(dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
if (copy_on_stream) {
auto params = new CopyParams(dst, src, byte_size);
cudaLaunchHostFunc(
cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
*cuda_used = true;
} else {
memcpy(dst, src, byte_size);
}
#else
memcpy(dst, src, byte_size);
#endif // TRITON_ENABLE_GPU
} else {
#ifdef TRITON_ENABLE_GPU
RETURN_IF_CUDA_ERR(
cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
msg + ": failed to perform CUDA copy");
*cuda_used = true;
#else
return Status(
Status::Code::INTERNAL,
msg + ": try to use CUDA copy while GPU is not supported");
#endif // TRITON_ENABLE_GPU
}
return Status::Success;
}
void
CopyBufferHandler(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, void* response_ptr,
triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
completion_queue)
{
bool cuda_used = false;
Status status = CopyBuffer(
msg, src_memory_type, src_memory_type_id, dst_memory_type,
dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}
#ifdef TRITON_ENABLE_GPU
Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
// Query the compute capability from the device
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
double compute_capability = cuprops.major + (cuprops.minor / 10.0);
if ((compute_capability > min_compute_capability) ||
(abs(compute_capability - min_compute_capability) < 0.01)) {
return Status::Success;
} else {
return Status(
Status::Code::UNSUPPORTED,
"gpu " + std::to_string(gpu_id) + " has compute capability '" +
std::to_string(cuprops.major) + "." +
std::to_string(cuprops.minor) +
"' which is less than the minimum supported of '" +
std::to_string(min_compute_capability) + "'");
}
}
Status
GetSupportedGPUs(
std::set<int>* supported_gpus, const double min_compute_capability)
{
// Make sure set is empty before starting
supported_gpus->clear();
int device_cnt;
cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
device_cnt = 0;
} else if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL, "unable to get number of CUDA devices: " +
std::string(cudaGetErrorString(cuerr)));
}
// populates supported_gpus
for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
if (status.IsOk()) {
supported_gpus->insert(gpu_id);
}
}
return Status::Success;
}
Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
// Query the device to check if integrated
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
// Zero-copy supported only on integrated GPU when it can map host memory
if (cuprops.integrated && cuprops.canMapHostMemory) {
*zero_copy_support = true;
} else {
*zero_copy_support = false;
}
return Status::Success;
}
#endif
}} // namespace triton::core
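// Illustrative only, not part of the original sources: a minimal sketch that
// uses the helpers defined above to enumerate the GPUs usable by Triton and to
// probe zero-copy support on each of them. It assumes a TRITON_ENABLE_GPU
// build; the function name and the minimum compute capability are hypothetical.
#ifdef TRITON_ENABLE_GPU
#include <set>

#include "cuda_utils.h"
#include "triton/common/logging.h"

static triton::core::Status
ExampleEnumerateGpus()
{
  std::set<int> gpus;
  triton::core::Status status =
      triton::core::GetSupportedGPUs(&gpus, 6.0 /* min compute capability */);
  if (!status.IsOk()) {
    return status;
  }
  for (const int gpu_id : gpus) {
    bool zero_copy = false;
    status = triton::core::SupportsIntegratedZeroCopy(gpu_id, &zero_copy);
    if (!status.IsOk()) {
      return status;
    }
    LOG_INFO << "GPU " << gpu_id
             << (zero_copy ? " supports" : " does not support")
             << " integrated zero-copy";
  }
  return triton::core::Status::Success;
}
#endif  // TRITON_ENABLE_GPU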
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <set>
#include "status.h"
#include "triton/common/sync_queue.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace core {
#ifdef TRITON_ENABLE_GPU
#define RETURN_IF_CUDA_ERR(X, MSG) \
do { \
cudaError_t err__ = (X); \
if (err__ != cudaSuccess) { \
return Status( \
Status::Code::INTERNAL, (MSG) + ": " + cudaGetErrorString(err__)); \
} \
} while (false)
#endif // TRITON_ENABLE_GPU
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
/// Get the memory info for the specified device.
/// \param device_id The device ID.
/// \param free Return free memory in bytes.
/// \param total Return total memory in bytes.
/// \return The error status. A non-OK status means failure to get memory info.
Status GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total);
/// Enable peer access for all GPU device pairs
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status means not all pairs are enabled
Status EnablePeerAccess(const double min_compute_capability);
/// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location
/// is identified by the memory type and id, and the corresponding copy will be
/// initiated.
/// \param msg The message to be prepended in error message.
/// \param src_memory_type The memory type CPU/GPU of the source.
/// \param src_memory_type_id The device id of the source.
/// \param dst_memory_type The memory type CPU/GPU of the destination.
/// \param dst_memory_type_id The device id of the destination.
/// \param byte_size The size in bytes to be copied from source to destination.
/// \param src The buffer start address of the source.
/// \param dst The buffer start address of the destination.
/// \param cuda_stream The stream to associate the copy with; 0 can be
/// passed for the default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return The error status. A non-ok status indicates failure to copy the
/// buffer.
Status CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used,
bool copy_on_stream = false);
#ifdef TRITON_ENABLE_GPU
/// Validates the compute capability of the indexed GPU.
/// \param gpu_id The index of the target GPU.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status CheckGPUCompatibility(
const int gpu_id, const double min_compute_capability);
/// Obtains the set of GPU ids that are supported by Triton.
/// \param supported_gpus Returns the set of integers which is
/// populated with the ids of the supported GPUs.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-ok status means there were
/// errors encountered while querying GPU devices.
Status GetSupportedGPUs(
std::set<int>* supported_gpus, const double min_compute_capability);
/// Checks if the GPU specified is an integrated GPU and supports Zero-copy.
/// \param gpu_id The index of the target GPU.
/// \param zero_copy_support If true, Zero-copy is supported by this GPU.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support);
#endif
// Helper around CopyBuffer that updates the completion queue with the returned
// status and cuda_used flag.
void CopyBufferHandler(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, void* response_ptr,
triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
completion_queue);
struct CopyParams {
CopyParams(void* dst, const void* src, const size_t byte_size)
: dst_(dst), src_(src), byte_size_(byte_size)
{
}
void* dst_;
const void* src_;
const size_t byte_size_;
};
}} // namespace triton::core
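// Illustrative only, not part of the original sources: a minimal sketch of the
// CopyBuffer() helper declared above for a host-to-host copy. With both memory
// types set to TRITONSERVER_MEMORY_CPU and 'copy_on_stream' left at its
// default, the copy falls back to memcpy(), so 'cuda_used' stays false and no
// stream synchronization is needed. The buffer contents, sizes, and function
// name are hypothetical.
#include <vector>

#include "cuda_utils.h"

static triton::core::Status
ExampleHostToHostCopy()
{
  std::vector<char> src(256, 'a');
  std::vector<char> dst(256, 0);
  bool cuda_used = false;
  return triton::core::CopyBuffer(
      "example copy", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
      TRITONSERVER_MEMORY_CPU, 0 /* dst_memory_type_id */, src.size(),
      src.data(), dst.data(), nullptr /* cuda_stream */, &cuda_used);
}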
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dynamic_batch_scheduler.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "constants.h"
#include "server.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {
bool
IsStaleState(Payload::State payload_state)
{
return (
(payload_state == Payload::State::EXECUTING) ||
(payload_state == Payload::State::RELEASED));
}
DynamicBatchScheduler::DynamicBatchScheduler(
TritonModel* model, TritonModelInstance* model_instance,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
const inference::ModelQueuePolicy& default_queue_policy,
const uint32_t priority_levels, const ModelQueuePolicyMap& queue_policy_map)
: model_(model), model_instance_(model_instance),
model_name_(model->Name()),
dynamic_batching_enabled_(dynamic_batching_enabled),
queue_(default_queue_policy, priority_levels, queue_policy_map),
stop_(false), max_batch_size_((size_t)std::max(1, max_batch_size)),
preferred_batch_sizes_(preferred_batch_sizes),
pending_batch_delay_ns_(max_queue_delay_microseconds * 1000),
pending_batch_size_(0), queued_batch_size_(0),
next_preferred_batch_size_(0),
enforce_equal_shape_tensors_(enforce_equal_shape_tensors),
has_optional_input_(false), preserve_ordering_(preserve_ordering)
{
rate_limiter_ = model_->Server()->GetRateLimiter();
// Both the server and the model config must enable caching
// for the model to utilize the response cache.
response_cache_enabled_ =
(model_->Server()->ResponseCacheEnabled() && response_cache_enable);
#ifdef TRITON_ENABLE_METRICS
// Initialize metric reporter for cache statistics if cache enabled
if (response_cache_enabled_) {
MetricModelReporter::Create(
model_name_, model_->Version(), METRIC_REPORTER_ID_RESPONSE_CACHE,
model_->Config().metric_tags(), &reporter_);
}
#endif // TRITON_ENABLE_METRICS
max_preferred_batch_size_ = 0;
for (const auto size : preferred_batch_sizes_) {
max_preferred_batch_size_ =
std::max(max_preferred_batch_size_, (size_t)size);
}
for (const auto& input : model_->Config().input()) {
if (input.optional()) {
has_optional_input_ = true;
break;
}
}
}
Status
DynamicBatchScheduler::Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
std::unique_ptr<Scheduler>* scheduler)
{
inference::ModelDynamicBatching batcher_config;
batcher_config.set_preserve_ordering(preserve_ordering);
for (const auto& bs : preferred_batch_sizes) {
batcher_config.add_preferred_batch_size(bs);
}
batcher_config.set_max_queue_delay_microseconds(max_queue_delay_microseconds);
return Create(
model, model_instance, nice, dynamic_batching_enabled, max_batch_size,
enforce_equal_shape_tensors, batcher_config, response_cache_enable,
scheduler);
}
Status
DynamicBatchScheduler::Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const inference::ModelDynamicBatching& batcher_config,
const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler)
{
std::set<int32_t> preferred_batch_sizes;
for (const auto size : batcher_config.preferred_batch_size()) {
preferred_batch_sizes.insert(size);
}
DynamicBatchScheduler* dyna_sched = new DynamicBatchScheduler(
model, model_instance, dynamic_batching_enabled, max_batch_size,
enforce_equal_shape_tensors, batcher_config.preserve_ordering(),
response_cache_enable, preferred_batch_sizes,
batcher_config.max_queue_delay_microseconds(),
batcher_config.default_queue_policy(), batcher_config.priority_levels(),
batcher_config.priority_queue_policy());
std::unique_ptr<DynamicBatchScheduler> sched(dyna_sched);
sched->scheduler_thread_exit_.store(false);
if (dynamic_batching_enabled) {
sched->NewPayload();
sched->scheduler_thread_ =
std::thread([dyna_sched, nice]() { dyna_sched->BatcherThread(nice); });
}
scheduler->reset(sched.release());
return Status::Success;
}
DynamicBatchScheduler::~DynamicBatchScheduler()
{
// Signal the scheduler thread to exit and then wait for it.
scheduler_thread_exit_.store(true);
cv_.notify_one();
if (scheduler_thread_.joinable()) {
scheduler_thread_.join();
}
}
Status
DynamicBatchScheduler::Enqueue(std::unique_ptr<InferenceRequest>& request)
{
if (stop_) {
return Status(
Status::Code::UNAVAILABLE,
request->LogRequest() +
"Server is stopping, scheduler for model has stopped accepting new "
"inference requests");
}
// If the queue start timestamp hasn't been set, the queue timer starts at
// the beginning of the queueing and scheduling process. Otherwise, the
// dynamic batcher is being used as a component of another batcher and should
// not overwrite the queue start timestamp.
if (request->QueueStartNs() == 0) {
request->CaptureQueueStartNs();
INFER_TRACE_ACTIVITY(
request->Trace(), TRITONSERVER_TRACE_QUEUE_START,
request->QueueStartNs());
#ifdef TRITON_ENABLE_TRACING
request->TraceInputTensors(
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT, "DynamicBatchScheduler Enqueue");
#endif // TRITON_ENABLE_TRACING
}
// Record the time at the beginning of batcher queueing. In the case of the
// oldest sequence batcher, this will overwrite the value that was previously
// set by the sequence batcher, which is okay because by this point the
// previous batcher no longer needs the value and it can be safely reused by
// the dynamic batcher.
request->CaptureBatcherStartNs();
std::unique_ptr<InferenceResponse> cached_response;
if (response_cache_enabled_) {
CacheLookUp(request, cached_response);
}
if (cached_response != nullptr) {
// If there was a cache hit then try sending the cached response
// and release the request.
if (preserve_ordering_) {
// In order to preserve the order, the response send must be
// delegated.
DelegateResponse(request);
}
// Send cached response and release request
InferenceResponse::Send(
std::move(cached_response), TRITONSERVER_RESPONSE_COMPLETE_FINAL);
InferenceRequest::Release(
std::move(request), TRITONSERVER_REQUEST_RELEASE_ALL);
return Status::Success;
}
if (!dynamic_batching_enabled_) {
if (preserve_ordering_ || response_cache_enabled_) {
DelegateResponse(request);
}
// If not using dynamic batching, directly enqueue the
// request to model for execution
auto payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INFER_RUN, nullptr /* TritonModelInstance*/);
payload->AddRequest(std::move(request));
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, payload));
} else {
bool wake_batcher = true;
{
std::lock_guard<std::mutex> lock(mu_);
queued_batch_size_ += std::max(1U, request->BatchSize());
// Assuming no error is returned, this call takes ownership of
// 'request' and so we can't use it after this point.
RETURN_IF_ERROR(queue_.Enqueue(request->Priority(), request));
// If there are any idle runners and the queued batch size is greater than or
// equal to the next preferred batch size, then wake the batcher up to service
// this request. We do the actual wake outside of the lock to avoid
// having the woken thread immediately block on the lock.
wake_batcher =
model_->Server()->GetRateLimiter()->PayloadSlotAvailable(model_);
// We may wake up the runner less often if we don't enforce equal shapes
// within a batch; otherwise we must always wake up the runner to check.
if (enforce_equal_shape_tensors_.empty()) {
std::lock_guard<std::mutex> exec_lock(*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
wake_batcher &=
(payload_saturated_ || IsStaleState(payload_state) ||
(queued_batch_size_ >= next_preferred_batch_size_));
}
}
if (wake_batcher) {
cv_.notify_one();
}
}
return Status::Success;
}
void
DynamicBatchScheduler::NewPayload()
{
curr_payload_ = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INFER_RUN, model_instance_);
payload_saturated_ = false;
}
void
DynamicBatchScheduler::BatcherThread(const int nice)
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at nice " << nice << "...";
} else {
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at default nice (requested nice " << nice
<< " failed)...";
}
#else
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at default nice...";
#endif
// For debugging/testing, delay start of threads until the queue
// contains the specified number of entries.
size_t delay_cnt = 0;
{
const char* dstr = getenv("TRITONSERVER_DELAY_SCHEDULER");
if (dstr != nullptr) {
delay_cnt = atoi(dstr);
LOG_VERBOSE(1) << "Delaying batcher thread for " << model_name_
<< " until " << delay_cnt << " queued requests...";
}
}
auto wait_for_slots = [this]() {
return model_->Server()->GetRateLimiter()->PayloadSlotAvailable(model_);
};
const uint64_t default_wait_microseconds = 500 * 1000;
while (!scheduler_thread_exit_.load()) {
NVTX_RANGE(nvtx_, "DynamicBatcher " + model_name_);
std::shared_ptr<std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>
rejected_requests;
uint64_t wait_microseconds = 0;
// Hold the lock for as short a time as possible.
{
std::unique_lock<std::mutex> lock(mu_);
{
std::lock_guard<std::mutex> exec_lock(*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
if (payload_saturated_ || IsStaleState(payload_state)) {
NewPayload();
next_preferred_batch_size_ = 0;
}
}
if (delay_cnt > 0) {
// Debugging/testing... wait until queue contains 'delay_cnt'
// items...
wait_microseconds = 10 * 1000;
if (queue_.Size() >= delay_cnt) {
delay_cnt = 0;
}
LOG_VERBOSE(1) << "Delaying batcher thread " << model_name_ << " until "
<< delay_cnt
<< " queued requests, current total = " << queue_.Size();
} else if (queue_.Empty()) {
wait_microseconds = default_wait_microseconds;
} else {
if (payload_saturated_) {
continue;
}
cv_.wait(lock, wait_for_slots);
{
std::lock_guard<std::mutex> exec_lock(
*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
if (IsStaleState(payload_state)) {
continue;
}
// Use dynamic batching to get request(s) to execute.
wait_microseconds = GetDynamicBatch();
// Get requests that are rejected from searching dynamic batch.
queue_.ReleaseRejectedRequests(&rejected_requests);
// Extract batch only if there is pending batch
auto pending_batch_queue_cnt = queue_.PendingBatchCount();
if ((wait_microseconds == 0) && (pending_batch_queue_cnt != 0)) {
curr_payload_->ReserveRequests(pending_batch_queue_cnt);
for (size_t idx = 0; idx < pending_batch_queue_cnt; ++idx) {
std::unique_ptr<InferenceRequest> request;
auto status = queue_.Dequeue(&request);
if (status.IsOk()) {
if (preserve_ordering_ || response_cache_enabled_) {
DelegateResponse(request);
}
curr_payload_->AddRequest(std::move(request));
} else {
// The queue is empty, which conflicts with the pending batch
// count. Send the current batch, if any, and reset the related
// variables.
LOG_ERROR << request->LogRequest()
<< "Failed to retrieve request from scheduler queue: "
<< status.Message();
queue_.ResetCursor();
queued_batch_size_ = 0;
pending_batch_size_ = 0;
break;
}
}
if (curr_payload_->GetState() == Payload::State::UNINITIALIZED) {
curr_payload_->SetState(Payload::State::READY);
}
queued_batch_size_ -= pending_batch_size_;
pending_batch_size_ = 0;
}
}
}
// If no requests are to be handled, wait for notification or
// for the specified timeout before checking the queue again.
if (wait_microseconds > 0) {
std::chrono::microseconds wait_timeout(wait_microseconds);
cv_.wait_for(lock, wait_timeout);
}
}
if (curr_payload_->GetState() == Payload::State::READY) {
auto callback = [this]() { cv_.notify_one(); };
curr_payload_->SetCallback(callback);
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, curr_payload_);
}
// Finish rejected requests if any
if (rejected_requests != nullptr) {
static Status rejected_status =
Status(Status::Code::UNAVAILABLE, "Request timeout expired");
for (auto& rejected_queue : *rejected_requests) {
for (auto& rejected_request : rejected_queue) {
InferenceRequest::RespondIfError(
rejected_request, rejected_status, true);
}
}
}
} // end runner loop
LOG_VERBOSE(1) << "Stopping dynamic-batcher thread for " << model_name_
<< "...";
}
uint64_t
DynamicBatchScheduler::GetDynamicBatch()
{
// 'mu_' mutex must be held when this function is called. queue_
// must not be empty.
// Examine the new requests. If adding these new requests to the
// pending batch allows a preferred batch size then execute it
// immediately. Stop examining requests if the maximum preferred
// batch size would be exceeded or if the shape of the next request
// does not match the shape of the pending batch.
bool send_now = false;
if (!queue_.IsCursorValid()) {
queue_.ResetCursor();
pending_batch_size_ = 0;
}
size_t best_preferred_batch_size = 0;
queued_batch_size_ -= queue_.ApplyPolicyAtCursor();
// When there is optional input or input shape must be enforced,
// the inputs in the requests must be examined for forming a batch
const bool check_input =
!enforce_equal_shape_tensors_.empty() || has_optional_input_;
auto payload_batch_size = curr_payload_->BatchSize();
while (!queue_.CursorEnd()) {
const auto batch_size = std::max(1U, queue_.RequestAtCursor()->BatchSize());
// If there is no pending batch, then this request is starting a
// new batch.
if ((payload_batch_size + queue_.PendingBatchCount()) == 0) {
// Get the shape of the new batch that is being started...
if (check_input) {
if (!curr_payload_->MutableRequiredEqualInputs()
->Initialize(
queue_.RequestAtCursor(), enforce_equal_shape_tensors_,
has_optional_input_)
.IsOk()) {
send_now = true;
break;
}
}
} else {
// There is a pending batch and adding this request would make
// the batch size larger than all of the preferred batch sizes,
// so mark the cursor at this point. Don't send the pending batch yet,
// so that we can examine the queue delay of the requests that fit in a batch.
if (((payload_batch_size + pending_batch_size_ + batch_size) >
max_preferred_batch_size_) &&
(best_preferred_batch_size == 0)) {
best_preferred_batch_size = pending_batch_size_;
queue_.MarkCursor();
payload_saturated_ = true;
}
if ((payload_batch_size + pending_batch_size_ + batch_size) >
max_batch_size_) {
send_now = true;
break;
}
// There is a pending batch and it has a different shape than
// this request, so send the pending batch as it is.
if (check_input &&
!curr_payload_->MutableRequiredEqualInputs()->HasEqualInputs(
queue_.RequestAtCursor())) {
curr_payload_->MarkSaturated();
send_now = true;
break;
}
}
pending_batch_size_ += batch_size;
queue_.AdvanceCursor();
queued_batch_size_ -= queue_.ApplyPolicyAtCursor();
if (preferred_batch_sizes_.find(pending_batch_size_ + payload_batch_size) !=
preferred_batch_sizes_.end()) {
best_preferred_batch_size = pending_batch_size_;
queue_.MarkCursor();
}
}
// Obtain the age of the oldest pending request to compare with the maximum
// batch queueing delay.
uint64_t now_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
uint64_t delay_ns = now_ns - queue_.OldestEnqueueTime();
bool delay_is_exceeded =
(pending_batch_delay_ns_ != 0) && (delay_ns >= pending_batch_delay_ns_);
// If we found a preferred batch size and the queue delay hasn't been
// exceeded, then execute that.
if ((best_preferred_batch_size != 0) && !delay_is_exceeded) {
if (pending_batch_delay_ns_ == 0) {
payload_saturated_ = true;
}
pending_batch_size_ = best_preferred_batch_size;
queue_.SetCursorToMark();
return 0;
}
// There is no request in the pending batch when all queued requests have
// exceeded their timeout and the queue policies are REJECT.
if (queue_.PendingBatchCount() == 0) {
return 0;
}
// If the delay has been exceeded, or if the current batch can't grow
// any larger, then just immediately execute whatever is pending.
if (send_now || ((payload_batch_size + pending_batch_size_) >=
max_preferred_batch_size_)) {
payload_saturated_ = true;
return 0;
}
if (delay_is_exceeded || (pending_batch_delay_ns_ == 0)) {
return 0;
}
// Set the next preferred batch size given the pending batch size
auto next_preferred_batch_size_it = preferred_batch_sizes_.upper_bound(
pending_batch_size_ + payload_batch_size);
if (next_preferred_batch_size_it != preferred_batch_sizes_.end()) {
next_preferred_batch_size_ = *next_preferred_batch_size_it;
} else {
next_preferred_batch_size_ =
preferred_batch_sizes_.empty() ? 0 : *preferred_batch_sizes_.begin();
}
if (next_preferred_batch_size_ != 0) {
next_preferred_batch_size_ -= payload_batch_size;
}
// At this point we have not found a pending batch that should be executed
// immediately. However, if we have scheduled a payload that can still grow
// and has not yet reached a preferred batch size, we should move the pending
// batch over to it so that the model instance picks up the largest available
// batch even if it is not a preferred batch.
if (!payload_saturated_ && (payload_batch_size != 0) &&
(preferred_batch_sizes_.find(payload_batch_size) ==
preferred_batch_sizes_.end())) {
return 0;
}
uint64_t wait_ns = pending_batch_delay_ns_ - delay_ns;
// Note that taking the request timeout into consideration allows us to reset
// the pending batch as soon as it is invalidated. The cost is that, in the
// edge case where timeouts expire one by one, the thread will be woken
// frequently.
if (queue_.ClosestTimeout() != 0) {
if (now_ns <= queue_.ClosestTimeout()) {
wait_ns = std::min(queue_.ClosestTimeout() - now_ns, wait_ns);
} else {
// A request in the pending batch has timed out; wait for 1 us to force the
// thread to reset the pending batch right away.
wait_ns = 1000;
}
}
// Return a non-zero wait in microseconds to cause this thread to wait
// until the queue delay or the closest timeout has expired.
// Another thread may be awakened by an incoming request and handle the
// pending batch before this thread wakes, and that is okay. But if no other
// request comes in, then this thread will wake, revisit the pending batch,
// see that the delay has been exceeded, and send the batch.
return wait_ns / 1000;
}
void
DynamicBatchScheduler::DelegateResponse(
std::unique_ptr<InferenceRequest>& request)
{
std::lock_guard<std::mutex> lock(completion_queue_mtx_);
completion_queue_.emplace_back();
auto queue_slot = &completion_queue_.back();
// Pass raw ptr to lambda for tracking stats from cache and updating
// metric reporter on cache miss stats after insertion
InferenceRequest* raw_request_ptr = request.get();
request->SetResponseDelegator(
[this, queue_slot, raw_request_ptr](
std::unique_ptr<InferenceResponse>&& response, const uint32_t flags) {
if (response_cache_enabled_ && raw_request_ptr->CacheKeyIsSet()) {
// Cache insertion happens here because we need the backend to have
// computed the inference response first in the case of cache miss
auto cache = model_->Server()->GetResponseCache();
auto status = cache->Insert(*response, raw_request_ptr);
bool cache_miss =
(status.StatusCode() != Status::Code::ALREADY_EXISTS);
if (cache_miss) {
#ifdef TRITON_ENABLE_STATS
// Update cache miss statistics even on failure to insert
// as we still spend time on lookup and attempting to insert
raw_request_ptr->ReportStatisticsCacheMiss(reporter_.get());
#endif // TRITON_ENABLE_STATS
if (!status.IsOk()) {
LOG_ERROR << raw_request_ptr->LogRequest()
<< "Failed to insert request_hash ["
<< raw_request_ptr->CacheKey()
<< "] into response cache: " << status.Message();
}
} // Otherwise do nothing; we update cache hit statistics on Lookup
}
if (preserve_ordering_) {
{
std::lock_guard<std::mutex> lock(completion_queue_mtx_);
queue_slot->emplace_back(std::move(response), flags);
}
FinalizeResponses();
} else {
InferenceResponse::Send(std::move(response), flags);
}
});
}
void
DynamicBatchScheduler::CacheLookUp(
std::unique_ptr<InferenceRequest>& request,
std::unique_ptr<InferenceResponse>& cached_response)
{
auto cache = model_->Server()->GetResponseCache();
// Lookup request in cache
std::unique_ptr<InferenceResponse> local_response;
request->ResponseFactory()->CreateResponse(&local_response);
auto status = cache->Lookup(local_response.get(), request.get());
if (status.IsOk() && (local_response != nullptr)) {
cached_response = std::move(local_response);
#ifdef TRITON_ENABLE_STATS
// Update model metrics/stats on cache hits
// Backends will update metrics as normal on cache misses
request->ReportStatisticsCacheHit(reporter_.get());
#endif // TRITON_ENABLE_STATS
}
}
void
DynamicBatchScheduler::FinalizeResponses()
{
// Need exclusive access to this function to ensure responses are sent
// in order.
std::lock_guard<std::mutex> lock(finalize_mtx_);
// Finalize the completed payloads in-order as far as possible
std::deque<std::pair<std::unique_ptr<InferenceResponse>, const uint32_t>>
responses;
{
std::lock_guard<std::mutex> queue_lock(completion_queue_mtx_);
while (!completion_queue_.empty() && !completion_queue_.front().empty()) {
bool response_complete = false;
for (auto& response_pair : completion_queue_.front()) {
// Assuming FINAL flag is set only in the last response of the request
response_complete =
((response_pair.second & TRITONSERVER_RESPONSE_COMPLETE_FINAL) !=
0);
responses.emplace_back(std::move(response_pair));
}
if (response_complete) {
completion_queue_.pop_front();
} else {
completion_queue_.front().clear();
}
}
}
for (auto& response : responses) {
InferenceResponse::Send(std::move(response.first), response.second);
}
}
}} // namespace triton::core
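// Illustrative only, not part of the original sources: a standalone, simplified
// sketch of the decision rule implemented by GetDynamicBatch() above. It
// ignores shape checks, payload state, and per-request timeouts, and keeps only
// the core idea: execute the pending batch immediately when a preferred batch
// size is reached or the maximum queue delay has elapsed, otherwise report how
// much longer to wait. All names are hypothetical.
#include <cstddef>
#include <cstdint>
#include <set>

namespace example {

// Returns 0 if the pending batch should be executed now, otherwise the
// remaining wait in nanoseconds before the queue delay forces execution.
inline uint64_t
SimplifiedBatchDecision(
    size_t pending_batch_size, const std::set<int32_t>& preferred_batch_sizes,
    uint64_t oldest_request_age_ns, uint64_t max_queue_delay_ns)
{
  const bool preferred_reached =
      preferred_batch_sizes.find(static_cast<int32_t>(pending_batch_size)) !=
      preferred_batch_sizes.end();
  const bool delay_exceeded = (max_queue_delay_ns != 0) &&
                              (oldest_request_age_ns >= max_queue_delay_ns);
  if (preferred_reached || delay_exceeded || (max_queue_delay_ns == 0)) {
    return 0;  // execute the pending batch now
  }
  return max_queue_delay_ns - oldest_request_age_ns;  // keep batching
}

}  // namespace example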
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <future>
#include <map>
#include <mutex>
#include <queue>
#include <set>
#include <thread>
#include "backend_model.h"
#include "backend_model_instance.h"
#include "model_config.pb.h"
#include "rate_limiter.h"
#include "scheduler.h"
#include "scheduler_utils.h"
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
// Scheduler that implements dynamic batching.
class DynamicBatchScheduler : public Scheduler {
public:
// Create a scheduler to support a given number of runners and a run
// function to call when a request is scheduled.
static Status Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
std::unique_ptr<Scheduler>* scheduler);
// Create a scheduler to support a given number of runners and a run
// function to call when a request is scheduled. This overload additionally
// supports different queue policies for different priority levels.
static Status Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const inference::ModelDynamicBatching& batcher_config,
const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler);
~DynamicBatchScheduler();
// \see Scheduler::Enqueue()
Status Enqueue(std::unique_ptr<InferenceRequest>& request) override;
// \see Scheduler::InflightInferenceCount()
size_t InflightInferenceCount() override
{
std::unique_lock<std::mutex> lock(mu_);
if (curr_payload_ != nullptr) {
return queue_.Size() + curr_payload_->RequestCount();
}
return queue_.Size();
}
// \see Scheduler::Stop()
void Stop() override { stop_ = true; }
MetricModelReporter* MetricReporter() const { return reporter_.get(); }
private:
DynamicBatchScheduler(
TritonModel* model, TritonModelInstance* model_instance,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
const inference::ModelQueuePolicy& default_queue_policy,
const uint32_t priority_levels,
const ModelQueuePolicyMap& queue_policy_map);
void BatcherThread(const int nice);
void NewPayload();
uint64_t GetDynamicBatch();
void DelegateResponse(std::unique_ptr<InferenceRequest>& request);
void CacheLookUp(
std::unique_ptr<InferenceRequest>& request,
std::unique_ptr<InferenceResponse>& cached_response);
void FinalizeResponses();
TritonModel* model_;
TritonModelInstance* model_instance_;
// Name of the model.
std::string model_name_;
// True if dynamic batching is enabled.
const bool dynamic_batching_enabled_;
// Map from priority level to the queue holding inference requests for the
// model represented by this scheduler. If priority queues are not supported
// by the scheduler, the priority-zero entry is used as the single queue.
PriorityQueue queue_;
bool stop_;
std::thread scheduler_thread_;
std::atomic<bool> scheduler_thread_exit_;
// Mutex and condvar for signaling scheduler thread
std::mutex mu_;
std::condition_variable cv_;
std::shared_ptr<RateLimiter> rate_limiter_;
std::shared_ptr<Payload> curr_payload_;
bool payload_saturated_;
size_t max_batch_size_;
size_t max_preferred_batch_size_;
std::set<int32_t> preferred_batch_sizes_;
uint64_t pending_batch_delay_ns_;
size_t pending_batch_size_;
size_t queued_batch_size_;
size_t next_preferred_batch_size_;
// The input tensors that require shape checking before being
// allowed in a batch, as a map from tensor name to a bool. If a
// tensor is in the map then its shape must match the shape of the
// same tensor in the requests already in the batch. If the value is
// "true" then the tensor is additionally treated as a shape tensor
// and the values it contains must match those of the same tensor
// already in the batch. (An illustrative example follows the member
// declaration below.)
const std::unordered_map<std::string, bool> enforce_equal_shape_tensors_;
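// Illustrative example (tensor names and values are hypothetical, not taken
// from any model config): an entry {"INPUT0", false} requires INPUT0 in a new
// request to have the same dimensions as INPUT0 in the requests already
// batched, while {"INPUT_SHAPE", true} additionally treats INPUT_SHAPE as a
// shape tensor whose element values must also match those already in the
// batch.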
// Store information on whether the model contains optional inputs.
bool has_optional_input_;
// If true the ordering of responses matches the order of requests
// even when there are multiple scheduler threads.
const bool preserve_ordering_;
// If true, the scheduler will try to retrieve responses from cache.
bool response_cache_enabled_;
// Per completion-id queues to store the ready responses
std::deque<
std::vector<std::pair<std::unique_ptr<InferenceResponse>, uint32_t>>>
completion_queue_;
// Lock to protect completion_queue_
std::mutex completion_queue_mtx_;
// Preserves the order in which responses are finalized
std::mutex finalize_mtx_;
// Reporter for metrics, or nullptr if no metrics should be reported
std::shared_ptr<MetricModelReporter> reporter_;
};
}} // namespace triton::core
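// ---------------------------------------------------------------------------
// Editorial usage sketch (not part of the original sources). It shows how a
// caller might create a DynamicBatchScheduler through the first Create()
// overload declared above and then enqueue a request. The 'model',
// 'model_instance', and 'request' arguments are assumed to already exist in
// the embedding code, and the batching parameter values are illustrative only.
namespace triton { namespace core {
inline Status
ExampleCreateAndEnqueue(
    TritonModel* model, TritonModelInstance* model_instance,
    std::unique_ptr<InferenceRequest>& request)
{
  std::unique_ptr<Scheduler> scheduler;
  // Build a dynamic batcher that preserves response ordering, prefers batch
  // sizes of 4 or 8, and waits at most 100us to form a preferred batch.
  Status status = DynamicBatchScheduler::Create(
      model, model_instance, /* nice */ 0,
      /* dynamic_batching_enabled */ true, /* max_batch_size */ 8,
      /* enforce_equal_shape_tensors */ {}, /* preserve_ordering */ true,
      /* response_cache_enable */ false,
      /* preferred_batch_sizes */ {4, 8},
      /* max_queue_delay_microseconds */ 100, &scheduler);
  if (!status.IsOk()) {
    return status;
  }
  // Hand the request over to the scheduler; responses are delivered
  // asynchronously through the request's response factory.
  return scheduler->Enqueue(request);
}
}}  // namespace triton::core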