Commit fcefbf3d authored by xiabo

Reorganize the project

parent d592fbea
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
  TRITONCORE_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONCORE_CMAKE_DIR})
if(NOT TARGET TritonCore::triton-core-serverapi)
  include("${TRITONCORE_CMAKE_DIR}/TritonCoreTargets.cmake")
endif()
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONBACKEND
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllexport)
#define TRITONBACKEND_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONBACKEND_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONBACKEND_ISPEC
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllimport)
#define TRITONBACKEND_ISPEC __declspec(dllexport)
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#endif
struct TRITONBACKEND_MemoryManager;
struct TRITONBACKEND_Input;
struct TRITONBACKEND_Output;
struct TRITONBACKEND_State;
struct TRITONBACKEND_Request;
struct TRITONBACKEND_ResponseFactory;
struct TRITONBACKEND_Response;
struct TRITONBACKEND_Backend;
struct TRITONBACKEND_Model;
struct TRITONBACKEND_ModelInstance;
struct TRITONBACKEND_BackendAttribute;
///
/// TRITONBACKEND API Version
///
/// The TRITONBACKEND API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// backend should check that the API version used to compile the
/// backend is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the backend.
///
///   uint32_t api_version_major, api_version_minor;
///   TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor);
///   if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
///       (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
///     return TRITONSERVER_ErrorNew(
///         TRITONSERVER_ERROR_UNSUPPORTED,
///         "triton backend API version does not support this backend");
///   }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 10
/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
/// TRITONBACKEND_API_VERSION_MINOR used to build the backend to
/// ensure that Triton is compatible with the backend.
///
/// \param major Returns the TRITONBACKEND API major version supported
/// by Triton.
/// \param minor Returns the TRITONBACKEND API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ApiVersion(
uint32_t* major, uint32_t* minor);
/// TRITONBACKEND_ArtifactType
///
/// The ways that the files that make up a backend or model are
/// communicated to the backend.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model or backend
/// artifacts are made available to Triton via a locally
/// accessible filesystem. The backend can access these files
/// using an appropriate system API.
///
typedef enum TRITONBACKEND_artifacttype_enum {
  TRITONBACKEND_ARTIFACT_FILESYSTEM
} TRITONBACKEND_ArtifactType;
///
/// TRITONBACKEND_MemoryManager
///
/// Object representing a memory manager that is capable of
/// allocating and otherwise managing different memory types. For
/// improved performance Triton maintains pools for GPU and CPU-pinned
/// memory and the memory manager allows backends to access those
/// pools.
///
/// Allocate a contiguous block of memory of a specific type using a
/// memory manager. Two error codes have specific interpretations for
/// this function:
///
/// TRITONSERVER_ERROR_UNSUPPORTED: Indicates that Triton is
/// incapable of allocating the requested memory type and memory
/// type ID. Requests for the memory type and ID will always fail,
/// regardless of the 'byte_size' of the request.
///
/// TRITONSERVER_ERROR_UNAVAILABLE: Indicates that Triton can
/// allocate the memory type and ID but that currently it cannot
/// allocate a contiguous block of memory of the requested
/// 'byte_size'.
///
/// \param manager The memory manager.
/// \param buffer Returns the allocated memory.
/// \param memory_type The type of memory to allocate.
/// \param memory_type_id The ID associated with the memory type to
/// allocate. For GPU memory this indicates the device ID of the GPU
/// to allocate from.
/// \param byte_size The size of memory to allocate, in bytes.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerAllocate(
TRITONBACKEND_MemoryManager* manager, void** buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
const uint64_t byte_size);
/// Free a buffer that was previously allocated with
/// TRITONBACKEND_MemoryManagerAllocate. The call must provide the
/// same values for 'memory_type' and 'memory_type_id' as were used
/// when the buffer was allocated, or else the behavior is undefined.
///
/// \param manager The memory manager.
/// \param buffer The allocated memory buffer to free.
/// \param memory_type The type of memory of the buffer.
/// \param memory_type_id The ID associated with the memory type of
/// the buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerFree(
TRITONBACKEND_MemoryManager* manager, void* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
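//
// For illustration only (not part of the API): a minimal sketch of
// allocating and then freeing a CPU-pinned buffer through the memory
// manager. A real backend should check the returned errors and may fall
// back to another memory type on TRITONSERVER_ERROR_UNSUPPORTED or
// TRITONSERVER_ERROR_UNAVAILABLE. 'manager' is assumed to have been
// obtained from TRITONBACKEND_BackendMemoryManager.
//
//   void* buffer = nullptr;
//   TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(
//       manager, &buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */,
//       1024 /* byte_size */);
//   if (err == nullptr) {
//     // ... use 'buffer' ...
//     err = TRITONBACKEND_MemoryManagerFree(
//         manager, buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */);
//   }
//   if (err != nullptr) {
//     TRITONSERVER_ErrorDelete(err);
//   }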
///
/// TRITONBACKEND_Input
///
/// Object representing an input tensor.
///
/// Get the name and properties of an input tensor. The returned
/// strings and other properties are owned by the input, not the
/// caller, and so should not be modified or freed.
///
/// \param input The input tensor.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBuffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputProperties(
TRITONBACKEND_Input* input, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);
/// Get the name and properties of an input tensor associated with a given
/// host policy. If there are no input buffers for the specified host policy,
/// the properties of the fallback input buffers are returned. The returned
/// strings and other properties are owned by the input, not the caller, and so
/// should not be modified or freed.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input properties
/// will be returned if nullptr is provided.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBufferForHostPolicy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);
/// Get a buffer holding (part of) the tensor data for an input. For a
/// given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputProperties. The
/// returned buffer is owned by the input and so should not be
/// modified or freed by the caller. The lifetime of the buffer
/// matches that of the input and so the buffer should not be accessed
/// after the input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBuffer(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id);
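//
// Illustrative sketch (assumes 'input' was obtained from a request; error
// handling elided): iterate over all buffers that make up an input tensor,
// preferring CPU memory for each buffer.
//
//   const char* name;
//   TRITONSERVER_DataType dtype;
//   const int64_t* shape;
//   uint32_t dims_count, buffer_count;
//   uint64_t byte_size;
//   TRITONBACKEND_InputProperties(
//       input, &name, &dtype, &shape, &dims_count, &byte_size, &buffer_count);
//   for (uint32_t b = 0; b < buffer_count; ++b) {
//     const void* buffer = nullptr;
//     uint64_t buffer_byte_size = 0;
//     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//     int64_t memory_type_id = 0;
//     TRITONBACKEND_InputBuffer(
//         input, b, &buffer, &buffer_byte_size, &memory_type, &memory_type_id);
//     // 'memory_type' now reports where 'buffer' actually resides.
//   }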
/// Get a buffer holding (part of) the tensor data for an input for a specific
/// host policy. If there are no input buffers specified for this host policy,
/// the fallback input buffer is returned.
/// For a given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputPropertiesForHostPolicy.
/// The returned buffer is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the buffer matches that of the input
/// and so the buffer should not be accessed after the input tensor object is
/// released.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input buffer
/// will be returned if nullptr is provided.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputPropertiesForHostPolicy.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name,
const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Get the buffer attributes associated with the given input buffer. For a
/// given input the number of buffers composing the input are found from
/// 'buffer_count' returned by TRITONBACKEND_InputProperties. The returned
/// 'buffer_attributes' is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the 'buffer_attributes' matches that of
/// the input and so the 'buffer_attributes' should not be accessed after the
/// input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index < buffer_count,
/// where buffer_count is the value returned by TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_attributes Returns the attributes for the given buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBufferAttributes(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
TRITONSERVER_BufferAttributes** buffer_attributes);
///
/// TRITONBACKEND_Output
///
/// Object representing a response output tensor.
///
/// Get a buffer to use to hold the tensor data for the output. The
/// returned buffer is owned by the output and so should not be freed
/// by the caller. The caller can and should fill the buffer with the
/// output data for the tensor. The lifetime of the buffer matches
/// that of the output and so the buffer should not be accessed after
/// the output tensor object is released.
///
/// \param output The output tensor.
/// \param buffer Returns a pointer to a buffer where the contents of
/// the output tensor should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_OutputBuffer(
TRITONBACKEND_Output* output, void** buffer,
const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id);
/// Get the buffer attributes associated with the given output buffer. The
/// returned 'buffer_attributes' is owned by the output and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the output and so the 'buffer_attributes' should not be
/// accessed after the output tensor object is released. This function must be
/// called after TRITONBACKEND_OutputBuffer, otherwise the attributes might
/// contain incorrect data.
///
/// \param output The output tensor.
/// \param buffer_attributes Returns the attributes for the output buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_OutputBufferAttributes(
TRITONBACKEND_Output* output,
TRITONSERVER_BufferAttributes** buffer_attributes);
///
/// TRITONBACKEND_Request
///
/// Object representing an inference request.
///
/// Get the ID of the request. Can be nullptr if the request doesn't have
/// an ID. The returned string is owned by the request, not the
/// caller, and so should not be modified or freed.
///
/// \param request The inference request.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestId(
TRITONBACKEND_Request* request, const char** id);
/// Get the correlation ID of the request if it is an unsigned integer.
/// Zero indicates that the request does not have a correlation ID.
/// Returns failure if the correlation ID for the given request is not an
/// unsigned integer.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestCorrelationId(
TRITONBACKEND_Request* request, uint64_t* id);
/// Get the correlation ID of the request if it is a string.
/// Empty string indicates that the request does not have a correlation ID.
/// Returns an error if the correlation ID for the given request is not a string.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
TRITONBACKEND_Request* request, const char** id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param request The inference request.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestFlags(
TRITONBACKEND_Request* request, uint32_t* flags);
/// Get the number of input tensors specified in the request.
///
/// \param request The inference request.
/// \param count Returns the number of input tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputCount(
TRITONBACKEND_Request* request, uint32_t* count);
/// Get the name of an input tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input_name Returns the name of the input tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** input_name);
/// Get a named request input. The lifetime of the returned input
/// object matches that of the request and so the input object should
/// not be accessed after the request object is released.
///
/// \param request The inference request.
/// \param name The name of the input.
/// \param input Returns the input corresponding to the name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInput(
TRITONBACKEND_Request* request, const char* name,
TRITONBACKEND_Input** input);
/// Get a request input by index. The order of inputs in a given
/// request is not necessarily consistent with other requests, even if
/// the requests are in the same batch. As a result, you cannot
/// assume that an index obtained from one request will point to the
/// same input in a different request.
///
/// The lifetime of the returned input object matches that of the
/// request and so the input object should not be accessed after the
/// request object is released.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input Returns the input corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputByIndex(
TRITONBACKEND_Request* request, const uint32_t index,
TRITONBACKEND_Input** input);
/// Get the number of output tensors requested to be returned in the
/// request.
///
/// \param request The inference request.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputCount(
TRITONBACKEND_Request* request, uint32_t* count);
/// Get the name of a requested output tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the requested output tensor. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_RequestOutputCount.
/// \param output_name Returns the name of the requested output tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** output_name);
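//
// Illustrative sketch (error handling elided): enumerate the output tensors
// requested by a request.
//
//   uint32_t output_count;
//   TRITONBACKEND_RequestOutputCount(request, &output_count);
//   for (uint32_t i = 0; i < output_count; ++i) {
//     const char* output_name = nullptr;
//     TRITONBACKEND_RequestOutputName(request, i, &output_name);
//     // ... decide whether and how to produce 'output_name' ...
//   }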
/// Returns the preferred memory type and memory type ID of the output buffer
/// for the request. As much as possible, Triton will attempt to return
/// the same memory_type and memory_type_id values that will be returned by
/// the subsequent call to TRITONBACKEND_OutputBuffer, however, the backend must
/// be capable of handling cases where the values differ.
///
/// \param request The request.
/// \param name The name of the output tensor. This is optional
/// and should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by Triton, taking into account the caller's preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type ID
/// preferred by Triton, taking into account the caller's preferred type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
/// A TRITONSERVER_ERROR_UNAVAILABLE error indicates that the properties are
/// not currently available; any other error code indicates a failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Release the request. The request should be released when it is no
/// longer needed by the backend. If this call returns with an error
/// (i.e. non-nullptr) then the request was not released and ownership
/// remains with the backend. If this call returns with success, the
/// 'request' object is no longer owned by the backend and must not be
/// used. Any tensor names, data types, shapes, input tensors,
/// etc. returned by TRITONBACKEND_Request* functions for this request
/// are no longer valid. If a persistent copy of that data is required
/// it must be created before calling this function.
///
/// \param request The inference request.
/// \param release_flags Flags indicating what type of request release
/// should be performed. \see TRITONSERVER_RequestReleaseFlag. \see
/// TRITONSERVER_InferenceRequestReleaseFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestRelease(
TRITONBACKEND_Request* request, uint32_t release_flags);
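//
// Illustrative sketch: after all responses for 'request' have been sent (or
// a response factory has been created for it), release the request back to
// Triton. TRITONSERVER_REQUEST_RELEASE_ALL is defined in tritonserver.h.
//
//   TRITONSERVER_Error* err =
//       TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL);
//   if (err != nullptr) {
//     // Ownership of 'request' remains with the backend; handle the error.
//     TRITONSERVER_ErrorDelete(err);
//   }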
///
/// TRITONBACKEND_ResponseFactory
///
/// Object representing an inference response factory. Using a
/// response factory is not required; instead a response can be
/// generated directly from a TRITONBACKEND_Request object using
/// TRITONBACKEND_ResponseNew(). A response factory allows a request
/// to be released before all responses have been sent. Releasing a
/// request as early as possible releases all input tensor data and
/// therefore may be desirable in some cases.
/// Create the response factory associated with a request.
///
/// \param factory Returns the new response factory.
/// \param request The inference request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryNew(
TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request);
/// Destroy a response factory.
///
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryDelete(
TRITONBACKEND_ResponseFactory* factory);
/// Send response flags without a corresponding response.
///
/// \param factory The response factory.
/// \param send_flags Flags to send. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags);
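//
// Illustrative sketch of the early-release pattern enabled by a response
// factory (error handling elided): capture a factory for the request,
// release the request to free its input tensors, and create/send responses
// later from the factory.
//
//   TRITONBACKEND_ResponseFactory* factory = nullptr;
//   TRITONBACKEND_ResponseFactoryNew(&factory, request);
//   TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL);
//   // ... later, possibly from another thread ...
//   TRITONBACKEND_Response* response = nullptr;
//   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
//   // ... add outputs and send; see TRITONBACKEND_ResponseSend below ...
//   TRITONBACKEND_ResponseFactoryDelete(factory);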
///
/// TRITONBACKEND_Response
///
/// Object representing an inference response. For a given request,
/// the backend must carefully manage the lifecycle of responses
/// generated for that request to ensure that the output tensor
/// buffers are allocated correctly. When a response is created with
/// TRITONBACKEND_ResponseNew or TRITONBACKEND_ResponseNewFromFactory,
/// all the outputs and corresponding buffers must be created for that
/// response using TRITONBACKEND_ResponseOutput and
/// TRITONBACKEND_OutputBuffer *before* another response is created
/// for the request. For a given response, outputs can be created in
/// any order but they must be created sequentially/synchronously (for
/// example, the backend cannot use multiple threads to simultaneously
/// add multiple outputs to a response).
///
/// The above requirement applies only to responses being generated
/// for a given request. The backend may generate responses in
/// parallel on multiple threads as long as those responses are for
/// different requests.
///
/// This order of response creation must be strictly followed. But,
/// once response(s) are created they do not need to be sent
/// immediately, nor do they need to be sent in the order they were
/// created. The backend may even delete a created response instead of
/// sending it by using TRITONBACKEND_ResponseDelete.
/// Create a response for a request.
///
/// \param response Returns the new response.
/// \param request The request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseNew(
TRITONBACKEND_Response** response, TRITONBACKEND_Request* request);
/// Create a response using a factory.
///
/// \param response Returns the new response.
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseNewFromFactory(
TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory);
/// Destroy a response. It is not necessary to delete a response if
/// TRITONBACKEND_ResponseSend is called as that function transfers
/// ownership of the response object to Triton.
///
/// \param response The response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseDelete(
TRITONBACKEND_Response* response);
/// Set a string parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
TRITONBACKEND_Response* response, const char* name, const char* value);
/// Set an integer parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
TRITONBACKEND_Response* response, const char* name, const int64_t value);
/// Set a boolean parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
TRITONBACKEND_Response* response, const char* name, const bool value);
/// Create an output tensor in the response. The lifetime of the
/// returned output tensor object matches that of the response and so
/// the output tensor object should not be accessed after the response
/// object is deleted.
///
/// \param response The response.
/// \param output Returns the new response output.
/// \param name The name of the output tensor.
/// \param datatype The datatype of the output tensor.
/// \param shape The shape of the output tensor.
/// \param dims_count The number of dimensions in the output tensor
/// shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count);
/// Send a response. Calling this function transfers ownership of the
/// response object to Triton. The caller must not access or delete
/// the response object after calling this function.
///
/// \param response The response.
/// \param send_flags Flags associated with the response. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \param error The TRITONSERVER_Error to send if the response is an
/// error, or nullptr if the response is successful.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
TRITONSERVER_Error* error);
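//
// Illustrative sketch (assumes a 1-D FP32 output named "OUTPUT0" with
// 'element_count' elements; both names are placeholders and error handling
// is elided): create a response for a request, add an output, fill its
// buffer, and send it as the final response.
//
//   TRITONBACKEND_Response* response = nullptr;
//   TRITONBACKEND_ResponseNew(&response, request);
//   TRITONBACKEND_Output* output = nullptr;
//   const int64_t shape[1] = {element_count};
//   TRITONBACKEND_ResponseOutput(
//       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape, 1);
//   void* buffer = nullptr;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_OutputBuffer(
//       output, &buffer, element_count * sizeof(float), &memory_type,
//       &memory_type_id);
//   // ... write element_count float values into 'buffer' (after checking
//   // that the returned memory_type is one the backend can write to) ...
//   TRITONBACKEND_ResponseSend(
//       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* success */);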
///
/// TRITONBACKEND_State
///
/// Object representing a state.
///
/// Create a state in the request. The returned state object is only valid
/// before TRITONBACKEND_StateUpdate is called. The state should not be
/// freed by the caller. If TRITONBACKEND_StateUpdate is not called, the
/// lifetime of the state matches the lifetime of the request. If the state
/// name does not exist in the 'states' section of the model configuration,
/// the state will not be created and an error will be returned. If this
/// function is called when sequence batching is not enabled or there is no
/// 'states' section in the sequence batching section of the model
/// configuration, this call will return an error.
///
/// \param state Returns the new state.
/// \param request The request.
/// \param name The name of the state.
/// \param datatype The datatype of the state.
/// \param shape The shape of the state.
/// \param dims_count The number of dimensions in the state shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateNew(
TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count);
/// Update the state for the sequence. Calling this function will replace the
/// state stored for this sequence in Triton with 'state' provided in the
/// function argument. If this function is called when sequence batching is not
/// enabled or there is no 'states' section in the sequence batching section of
/// the model configuration, this call will return an error. The backend is not
/// required to call this function. If the backend doesn't call
/// TRITONBACKEND_StateUpdate, this particular state for the sequence
/// will not be updated and the next inference request in the sequence will use
/// the same state as the current inference request.
///
/// \param state The state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateUpdate(
TRITONBACKEND_State* state);
/// Get a buffer to use to hold the tensor data for the state. The returned
/// buffer is owned by the state and so should not be freed by the caller. The
/// caller can and should fill the buffer with the state data. The buffer must
/// not be accessed by the backend after TRITONBACKEND_StateUpdate is called.
/// The caller should fill the buffer before calling TRITONBACKEND_StateUpdate.
///
/// \param state The state.
/// \param buffer Returns a pointer to a buffer where the contents of the state
/// should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBuffer(
TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
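//
// Illustrative sketch (assumes the model configuration declares a state
// named "STATE0" of type FP32 with 'count' elements; both are placeholders
// and error handling is elided): create a state for the request, fill its
// buffer, and commit it so the next request in the sequence sees the update.
//
//   TRITONBACKEND_State* state = nullptr;
//   const int64_t shape[1] = {count};
//   TRITONBACKEND_StateNew(
//       &state, request, "STATE0", TRITONSERVER_TYPE_FP32, shape, 1);
//   void* buffer = nullptr;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_StateBuffer(
//       state, &buffer, count * sizeof(float), &memory_type, &memory_type_id);
//   // ... write the new state values into 'buffer' ...
//   TRITONBACKEND_StateUpdate(state);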
/// Get the buffer attributes associated with the given state buffer.
/// The returned 'buffer_attributes' is owned by the state and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the state.
///
/// \param state The state.
/// \param buffer_attributes Returns the buffer attributes for the given state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBufferAttributes(
TRITONBACKEND_State* state,
TRITONSERVER_BufferAttributes** buffer_attributes);
///
/// TRITONBACKEND_Backend
///
/// Object representing a backend.
///
/// TRITONBACKEND_ExecutionPolicy
///
/// Types of execution policy that can be implemented by a backend.
///
/// TRITONBACKEND_EXECUTION_BLOCKING: An instance of the model
/// blocks in TRITONBACKEND_ModelInstanceExecute until it is ready
/// to handle another inference. Upon returning from
/// TRITONBACKEND_ModelInstanceExecute, Triton may immediately
/// call TRITONBACKEND_ModelInstanceExecute for the same instance
/// to execute a new batch of requests. Thus, most backends using
/// this policy will not return from
/// TRITONBACKEND_ModelInstanceExecute until all responses have
/// been sent and all requests have been released. This is the
/// default execution policy.
///
/// TRITONBACKEND_EXECUTION_DEVICE_BLOCKING: An instance, A, of the
/// model blocks in TRITONBACKEND_ModelInstanceExecute if the
/// device associated with the instance is unable to handle
/// another inference. Even if another instance, B, associated
/// with the device, is available and ready to perform an
/// inference, Triton will not invoke
/// TRITONBACKEND_ModelInstanceExecute for B until A returns from
/// TRITONBACKEND_ModelInstanceExecute. Triton will not be blocked
/// from calling TRITONBACKEND_ModelInstanceExecute for instance
/// C, which is associated with a different device than A and B,
/// even if A or B has not returned from
/// TRITONBACKEND_ModelInstanceExecute. This execution policy is
/// typically used by a backend that can cooperatively execute
/// multiple model instances on the same device.
///
typedef enum TRITONBACKEND_execpolicy_enum {
  TRITONBACKEND_EXECUTION_BLOCKING,
  TRITONBACKEND_EXECUTION_DEVICE_BLOCKING
} TRITONBACKEND_ExecutionPolicy;
/// Get the name of the backend. The caller does not own the returned
/// string and must not modify or delete it. The lifetime of the
/// returned string extends only as long as 'backend'.
///
/// \param backend The backend.
/// \param name Returns the name of the backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendName(
TRITONBACKEND_Backend* backend, const char** name);
/// Get the backend configuration. The 'backend_config' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The backend configuration, as JSON, is:
///
///   {
///     "cmdline" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param backend The backend.
/// \param backend_config Returns the backend configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendConfig(
TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config);
/// Get the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING.
///
/// \param backend The backend.
/// \param policy Returns the execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy);
/// Set the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING. Triton reads
/// the backend's execution policy after calling
/// TRITONBACKEND_Initialize, so to be recognized, changes to the
/// execution policy must be made in TRITONBACKEND_Initialize.
/// Also, note that if the sequence batcher is used for the model, Triton will
/// use the TRITONBACKEND_EXECUTION_BLOCKING policy irrespective of the
/// policy specified by this setter function.
///
/// \param backend The backend.
/// \param policy The execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy);
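//
// Illustrative sketch: a backend that wants device-blocking execution would
// typically set the policy from its TRITONBACKEND_Initialize implementation
// (declared later in this header), since Triton reads the policy only after
// that call returns. Error handling beyond propagating the error is elided.
//
//   TRITONSERVER_Error*
//   TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
//   {
//     // ... other backend initialization ...
//     TRITONSERVER_Error* err = TRITONBACKEND_BackendSetExecutionPolicy(
//         backend, TRITONBACKEND_EXECUTION_DEVICE_BLOCKING);
//     if (err != nullptr) {
//       return err;
//     }
//     return nullptr;  // success
//   }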
/// Get the location of the files that make up the backend
/// implementation. This location contains the backend shared library
/// and any other files located with the shared library. The
/// 'location' communicated depends on how the backend is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The backend artifacts are
/// made available to Triton via the local filesystem. 'location'
/// returns the full path to the directory containing this
/// backend's artifacts. The returned string is owned by Triton,
/// not the caller, and so should not be modified or freed.
///
/// \param backend The backend.
/// \param artifact_type Returns the artifact type for the backend.
/// \param location Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendArtifacts(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
const char** location);
/// Get the memory manager associated with a backend.
///
/// \param backend The backend.
/// \param manager Returns the memory manager.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendMemoryManager(
TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager);
/// Get the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendState(
TRITONBACKEND_Backend* backend, void** state);
/// Set the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendSetState(
TRITONBACKEND_Backend* backend, void* state);
///
/// TRITONBACKEND_Model
///
/// Object representing a model implemented using the backend.
///
/// Get the name of the model. The returned string is owned by the
/// model object, not the caller, and so should not be modified or
/// freed.
///
/// \param model The model.
/// \param name Returns the model name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelName(
TRITONBACKEND_Model* model, const char** name);
/// Get the version of the model.
///
/// \param model The model.
/// \param version Returns the model version.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelVersion(
TRITONBACKEND_Model* model, uint64_t* version);
/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model artifacts are made
/// available to Triton via the local filesystem. 'location'
/// returns the full path to the directory in the model repository
/// that contains this model's artifacts. The returned string is
/// owned by Triton, not the caller, and so should not be modified
/// or freed.
///
/// \param model The model.
/// \param artifact_type Returns the artifact type for the model.
/// \param location Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelRepository(
TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
const char** location);
/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. The configuration is available via this call even
/// before the model is loaded and so can be used in
/// TRITONBACKEND_ModelInitialize. TRITONSERVER_ServerModelConfig
/// returns equivalent information but is not usable until after the
/// model loads.
///
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message** model_config);
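//
// Illustrative sketch (error handling elided): fetch the model
// configuration, serialize it to JSON for parsing, and then release the
// message, which is owned by the caller. TRITONSERVER_MessageSerializeToJson
// and TRITONSERVER_MessageDelete are declared in tritonserver.h.
//
//   TRITONSERVER_Message* config_message = nullptr;
//   TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config_message);
//   const char* config_json = nullptr;
//   size_t config_json_size = 0;
//   TRITONSERVER_MessageSerializeToJson(
//       config_message, &config_json, &config_json_size);
//   // ... parse the JSON in [config_json, config_json + config_json_size) ...
//   TRITONSERVER_MessageDelete(config_message);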
/// Whether the backend should attempt to auto-complete the model configuration.
/// If true, the backend should fill the inputs, outputs, and max batch size in
/// the model configuration if incomplete. If the model configuration is
/// changed, the new configuration must be reported to Triton using
/// TRITONBACKEND_ModelSetConfig.
///
/// \param model The model.
/// \param auto_complete_config Returns whether the backend should auto-complete
/// the model configuration.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
TRITONBACKEND_Model* model, bool* auto_complete_config);
/// Set the model configuration in Triton server. This API should only be called
/// when the backend implements the auto-completion of model configuration
/// and TRITONBACKEND_ModelAutoCompleteConfig returns true in
/// auto_complete_config. Only the inputs, outputs, max batch size, and
/// scheduling choice can be changed. One caveat: the scheduling choice can
/// only be changed if none was previously set. Any other changes to the model
/// configuration will be ignored by Triton. This function can only be called
/// from TRITONBACKEND_ModelInitialize, calling in any other context will result
/// in an error being returned. Additionally, Triton server can add some of the
/// missing fields in the provided config with this call. The backend must get
/// the complete configuration again by using TRITONBACKEND_ModelConfig.
/// TRITONBACKEND_ModelSetConfig does not take ownership of the message object
/// and so the caller should call TRITONSERVER_MessageDelete to release the
/// object once the function returns.
///
/// \param model The model.
/// \param config_version The format version of the model configuration.
/// If the configuration is not represented in the version's format
/// then an error will be returned. Currently only version 1 is supported.
/// \param model_config The updated model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message* model_config);
/// Get the TRITONSERVER_Server object that this model is being served
/// by.
///
/// \param model The model.
/// \param server Returns the server.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelServer(
TRITONBACKEND_Model* model, TRITONSERVER_Server** server);
/// Get the backend used by the model.
///
/// \param model The model.
/// \param backend Returns the backend object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelBackend(
TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend);
/// Get the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelState(
TRITONBACKEND_Model* model, void** state);
/// Set the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetState(
TRITONBACKEND_Model* model, void* state);
///
/// TRITONBACKEND_ModelInstance
///
/// Object representing a model instance implemented using the
/// backend.
///
/// Get the name of the model instance. The returned string is owned by the
/// model instance, not the caller, and so should not be modified or
/// freed.
///
/// \param instance The model instance.
/// \param name Returns the instance name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceName(
TRITONBACKEND_ModelInstance* instance, const char** name);
/// Get the kind of the model instance.
///
/// \param instance The model instance.
/// \param kind Returns the instance kind.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceKind(
TRITONBACKEND_ModelInstance* instance,
TRITONSERVER_InstanceGroupKind* kind);
/// Get the device ID of the model instance.
///
/// \param instance The model instance.
/// \param device_id Returns the instance device ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceDeviceId(
TRITONBACKEND_ModelInstance* instance, int32_t* device_id);
/// Get the host policy setting. The 'host_policy' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The host policy setting, as JSON, is:
///
///   {
///     "<host_policy>" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param instance The model instance.
/// \param host_policy Returns the host policy setting as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy);
/// Whether the model instance is passive.
///
/// \param instance The model instance.
/// \param is_passive Returns true if the instance is passive, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceIsPassive(
TRITONBACKEND_ModelInstance* instance, bool* is_passive);
/// Get the number of optimization profiles to be loaded for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of optimization profiles.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count);
/// Get the name of the optimization profile. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'instance'.
///
/// \param instance The model instance.
/// \param index The index of the optimization profile. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceProfileCount.
/// \param profile_name Returns the name of the optimization profile
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
TRITONBACKEND_ModelInstance* instance, const uint32_t index,
const char** profile_name);
/// Get the number of secondary devices configured for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of secondary devices.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count);
/// Get the properties of the indexed secondary device. The returned
/// strings and other properties are owned by the instance, not the
/// caller, and so should not be modified or freed.
///
/// \param instance The model instance.
/// \param index The index of the secondary device. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceSecondaryDeviceCount.
/// \param kind Returns the kind of secondary device corresponding
/// to the index.
/// \param id Returns the ID of the secondary device corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
int64_t* id);
/// Get the model associated with a model instance.
///
/// \param instance The model instance.
/// \param model Returns the model object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceModel(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model);
/// Get the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceState(
TRITONBACKEND_ModelInstance* instance, void** state);
/// Set the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceSetState(
TRITONBACKEND_ModelInstance* instance, void* state);
/// Record statistics for an inference request.
///
/// Set 'success' true to indicate that the inference request
/// completed successfully. In this case all timestamps should be
/// non-zero values reported in nanoseconds and should be collected
/// using std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// Set 'success' to false to indicate that the inference request failed
/// to complete successfully. In this case all timestamp values are
/// ignored.
///
/// For consistency of measurement across different backends, the
/// timestamps should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
///   TRITONBACKEND_ModelInstanceExecute()
///     CAPTURE TIMESTAMP (exec_start_ns)
///     < process input tensors to prepare them for inference
///       execution, including copying the tensors to/from the GPU if
///       necessary >
///     CAPTURE TIMESTAMP (compute_start_ns)
///     < perform inference computations to produce outputs >
///     CAPTURE TIMESTAMP (compute_end_ns)
///     < allocate output buffers and extract output tensors, including
///       copying the tensors to/from the GPU if necessary >
///     CAPTURE TIMESTAMP (exec_end_ns)
///     return
///
/// Note that these statistics are associated with a valid
/// TRITONBACKEND_Request object and so must be reported before the
/// request is released. For backends that release the request before
/// all response(s) are sent, these statistics cannot capture
/// information about the time required to produce the response.
///
/// \param instance The model instance.
/// \param request The inference request that statistics are being
/// reported for.
/// \param success True if the inference request completed
/// successfully, false if it failed to complete.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
const bool success, const uint64_t exec_start_ns,
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns);
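/// A minimal sketch of the timestamp capture described above, using
/// std::chrono (<chrono>) as suggested; error returns are ignored here
/// for brevity:
///
///   auto now_ns = []() -> uint64_t {
///     return std::chrono::duration_cast<std::chrono::nanoseconds>(
///                std::chrono::steady_clock::now().time_since_epoch())
///         .count();
///   };
///   const uint64_t exec_start_ns = now_ns();
///   // < process input tensors >
///   const uint64_t compute_start_ns = now_ns();
///   // < perform inference computations >
///   const uint64_t compute_end_ns = now_ns();
///   // < extract output tensors >
///   const uint64_t exec_end_ns = now_ns();
///   TRITONBACKEND_ModelInstanceReportStatistics(
///       instance, request, true /* success */, exec_start_ns,
///       compute_start_ns, compute_end_ns, exec_end_ns);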
/// Record statistics for the execution of an entire batch of
/// inference requests.
///
/// All timestamps should be non-zero values reported in nanoseconds
/// and should be collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// See TRITONBACKEND_ModelInstanceReportStatistics for more information about
/// the timestamps.
///
/// 'batch_size' is the sum of the batch sizes for the individual
/// requests that were delivered together in the call to
/// TRITONBACKEND_ModelInstanceExecute. For example, if three requests
/// are passed to TRITONBACKEND_ModelInstanceExecute and those
/// requests have batch size 1, 2, and 3; then 'batch_size' should be
/// set to 6.
///
/// \param instance The model instance.
/// \param batch_size Combined batch size of all the individual
/// requests executed in the batch.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
const uint64_t exec_start_ns, const uint64_t compute_start_ns,
const uint64_t compute_end_ns, const uint64_t exec_end_ns);
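/// For example (a sketch, reusing timestamps captured as in the sketch
/// above), if the requests executed together had batch sizes 1, 2 and 3,
/// the backend would report:
///
///   const uint64_t total_batch_size = 1 + 2 + 3;  // = 6
///   TRITONBACKEND_ModelInstanceReportBatchStatistics(
///       instance, total_batch_size, exec_start_ns, compute_start_ns,
///       compute_end_ns, exec_end_ns);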
///
/// The following functions can be implemented by a backend. Functions
/// indicated as required must be implemented or the backend will fail
/// to load.
///
/// Initialize a backend. This function is optional, a backend is not
/// required to implement it. This function is called once when a
/// backend is loaded to allow the backend to initialize any state
/// associated with the backend. A backend has a single state that is
/// shared across all models that use the backend.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(
TRITONBACKEND_Backend* backend);
/// Finalize for a backend. This function is optional, a backend is
/// not required to implement it. This function is called once, just
/// before the backend is unloaded. All state associated with the
/// backend should be freed and any threads created for the backend
/// should be exited/joined before returning from this function.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(
TRITONBACKEND_Backend* backend);
/// Initialize for a model. This function is optional, a backend is
/// not required to implement it. This function is called once when a
/// model that uses the backend is loaded to allow the backend to
/// initialize any state associated with the model. The backend should
/// also examine the model configuration to determine if the
/// configuration is suitable for the backend. Any errors reported by
/// this function will prevent the model from loading.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(
TRITONBACKEND_Model* model);
/// Finalize for a model. This function is optional, a backend is not
/// required to implement it. This function is called once for a
/// model, just before the model is unloaded from Triton. All state
/// associated with the model should be freed and any threads created
/// for the model should be exited/joined before returning from this
/// function.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(
TRITONBACKEND_Model* model);
/// Initialize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once when a model instance is created to allow the backend to
/// initialize any state associated with the instance.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(
TRITONBACKEND_ModelInstance* instance);
/// Finalize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once for an instance, just before the corresponding model is
/// unloaded from Triton. All state associated with the instance
/// should be freed and any threads created for the instance should be
/// exited/joined before returning from this function.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(
TRITONBACKEND_ModelInstance* instance);
/// Execute a batch of one or more requests on a model instance. This
/// function is required. Triton will not perform multiple
/// simultaneous calls to this function for a given model 'instance';
/// however, there may be simultaneous calls for different model
/// instances (for the same or different models).
///
/// If an error is returned the ownership of the request objects
/// remains with Triton and the backend must not retain references to
/// the request objects or access them in any way.
///
/// If success is returned, ownership of the request objects is
/// transferred to the backend and it is then responsible for creating
/// responses and releasing the request objects. Note that even though
/// ownership of the request objects is transferred to the backend, the
/// ownership of the buffer holding request pointers is returned back
/// to Triton upon return from TRITONBACKEND_ModelInstanceExecute. If
/// any request objects need to be maintained beyond
/// TRITONBACKEND_ModelInstanceExecute, then the pointers must be copied
/// out of the array within TRITONBACKEND_ModelInstanceExecute.
///
/// \param instance The model instance.
/// \param requests The requests.
/// \param request_count The number of requests in the batch.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count);
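/// A skeleton sketch of the ownership contract described above.
/// Response creation is omitted; TRITONBACKEND_RequestRelease and the
/// TRITONSERVER_REQUEST_RELEASE_ALL flag are declared elsewhere in the
/// Triton core headers:
///
///   TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
///       TRITONBACKEND_ModelInstance* instance,
///       TRITONBACKEND_Request** requests, const uint32_t request_count)
///   {
///     for (uint32_t r = 0; r < request_count; ++r) {
///       TRITONBACKEND_Request* request = requests[r];
///       // < create and send response(s) for 'request' >
///       TRITONBACKEND_RequestRelease(
///           request, TRITONSERVER_REQUEST_RELEASE_ALL);
///     }
///     return nullptr;  // success: the backend took ownership of the requests
///   }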
/// Query the backend for different model attributes. This function is optional;
/// a backend is not required to implement it. The backend is also not required
/// to set all of the backend attributes listed. This function is called when
/// Triton requires further backend / model information to perform operations.
/// This function may be called multiple times within the lifetime of the
/// backend (between TRITONBACKEND_Initialize and TRITONBACKEND_Finalize).
/// The backend may return an error to indicate failure to set the backend
/// attributes, in which case the attributes specified in the same function call
/// will be ignored. Triton will update the specified attributes only if
/// 'nullptr' (success) is returned.
///
/// \param backend The backend.
/// \param backend_attributes Returns the backend attributes.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
TRITONBACKEND_Backend* backend,
TRITONBACKEND_BackendAttribute* backend_attributes);
/// TRITONBACKEND_BackendAttribute
///
/// API to modify attributes associated with a backend.
///
/// Add the preferred instance group of the backend. This function
/// can be called multiple times to cover the different instance group kinds
/// that the backend supports, with the priority given by the call order: the
/// first call describes the most preferred group. In the case where instance
/// groups are not explicitly provided, Triton will use this attribute to
/// create a model deployment that aligns with the backend's preference.
///
/// \param backend_attributes The backend attributes object.
/// \param kind The kind of the instance group.
/// \param count The number of instances per device. Triton default will be used
/// if 0 is provided.
/// \param device_ids The devices where instances should be available. Triton
/// default will be used if 'nullptr' is provided.
/// \param id_count The number of devices in 'device_ids'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
TRITONBACKEND_BackendAttribute* backend_attributes,
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count);
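/// For example (a sketch), a backend that prefers GPU instances but also
/// supports CPU could implement TRITONBACKEND_GetBackendAttribute as:
///
///   TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
///       TRITONBACKEND_Backend* backend,
///       TRITONBACKEND_BackendAttribute* backend_attributes)
///   {
///     // Most preferred group first; 0 / nullptr select Triton defaults.
///     TRITONSERVER_Error* err =
///         TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
///             backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU,
///             0 /* count */, nullptr /* device_ids */, 0 /* id_count */);
///     if (err != nullptr) {
///       return err;
///     }
///     return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
///         backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU,
///         0 /* count */, nullptr /* device_ids */, 0 /* id_count */);
///   }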
#ifdef __cplusplus
}
#endif
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONREPOAGENT
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllexport)
#define TRITONREPOAGENT_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONREPOAGENT_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONREPOAGENT_ISPEC
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllimport)
#define TRITONREPOAGENT_ISPEC __declspec(dllexport)
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#endif
struct TRITONREPOAGENT_Agent;
struct TRITONREPOAGENT_AgentModel;
///
/// TRITONREPOAGENT API Version
///
/// The TRITONREPOAGENT API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// repository agent should check that the API version used to compile
/// the agent is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the agent.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONREPOAGENT_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONREPOAGENT_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONREPOAGENT_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton repository agent API version does not support this agent");
/// }
///
#define TRITONREPOAGENT_API_VERSION_MAJOR 0
#define TRITONREPOAGENT_API_VERSION_MINOR 1
/// Get the TRITONREPOAGENT API version supported by Triton. This
/// value can be compared against the
/// TRITONREPOAGENT_API_VERSION_MAJOR and
/// TRITONREPOAGENT_API_VERSION_MINOR used to build the agent to
/// ensure that Triton is compatible with the agent.
///
/// \param major Returns the TRITONREPOAGENT API major version supported
/// by Triton.
/// \param minor Returns the TRITONREPOAGENT API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ApiVersion(
uint32_t* major, uint32_t* minor);
/// TRITONREPOAGENT_ArtifactType
///
/// The ways that the files that make up a model's repository content
/// are communicated between Triton and the agent.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a locally
/// accessible filesystem. The agent can access these files using
/// an appropriate filesystem API.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a remote filesystem.
/// The remote filesystem path follows the same convention as is used for
/// repository paths, for example, "s3://" prefix indicates an S3 path.
///
typedef enum TRITONREPOAGENT_artifacttype_enum {
TRITONREPOAGENT_ARTIFACT_FILESYSTEM,
TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM
} TRITONREPOAGENT_ArtifactType;
/// TRITONREPOAGENT_ActionType
///
/// Types of repository actions that can be handled by an agent.
/// The lifecycle of a TRITONREPOAGENT_AgentModel begins with a call to
/// TRITONREPOAGENT_ModelInitialize and ends with a call to
/// TRITONREPOAGENT_ModelFinalize. Between those calls the current lifecycle
/// state of the model is communicated by calls to TRITONREPOAGENT_ModelAction.
/// Possible lifecycles are:
///
/// LOAD -> LOAD_COMPLETE -> UNLOAD -> UNLOAD_COMPLETE
/// LOAD -> LOAD_FAIL
///
/// TRITONREPOAGENT_ACTION_LOAD: A model is being loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_COMPLETE: The model load completed
/// successfully and the model is now loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_FAIL: The model load did not complete
/// successfully. The model is not loaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD: The model is being unloaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE: The model unload is complete.
///
typedef enum TRITONREPOAGENT_actiontype_enum {
TRITONREPOAGENT_ACTION_LOAD,
TRITONREPOAGENT_ACTION_LOAD_COMPLETE,
TRITONREPOAGENT_ACTION_LOAD_FAIL,
TRITONREPOAGENT_ACTION_UNLOAD,
TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE
} TRITONREPOAGENT_ActionType;
/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to the agent as indicated by 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// made available to the agent via the local
///     filesystem. 'location' returns the full path to the directory
/// in the model repository that contains the model's
/// artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The
/// contents of the directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents. Use
///     TRITONREPOAGENT_ModelRepositoryLocationAcquire to get a location
///     that can be used to modify the model repository contents.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// made available to the agent via a remote filesystem.
/// 'location' returns the full path to the remote directory that contains
/// the model's artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The contents of
/// the remote directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents.
/// Use TRITONREPOAGENT_ModelRepositoryLocationAcquire to get a location
/// that can be used to write updated model repository contents.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type Returns the artifact type for the location.
/// \param location Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocation(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
TRITONREPOAGENT_ArtifactType* artifact_type, const char** location);
/// Acquire a location where the agent can produce a new version of
/// the model repository files. This is a convenience method to create
/// a temporary directory for the agent. The agent is responsible for
/// calling TRITONREPOAGENT_ModelRepositoryLocationRelease in
/// TRITONREPOAGENT_ModelFinalize to release the location. Initially the
/// acquired location is empty. The 'location' communicated depends on
/// the requested 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The location is a directory
/// on the local filesystem. 'location' returns the full path to
/// an empty directory that the agent should populate with the
/// model's artifacts. The returned location string is owned by
/// Triton, not the agent, and so should not be modified or freed.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param location Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationAcquire(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const TRITONREPOAGENT_ArtifactType artifact_type, const char** location);
/// Discard and release ownership of a previously acquired location
/// and its contents. The agent must not access or modify the location
/// or its contents after this call.
///
/// \param agent The agent.
/// \param model The model.
/// \param location The location to release.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationRelease(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const char* location);
/// Inform Triton that the specified repository location should be used for
/// the model in place of the original model repository. This method can only be
/// called when TRITONREPOAGENT_ModelAction is invoked with
/// TRITONREPOAGENT_ACTION_LOAD. The 'location' communicated depends on
/// how the repository is being communicated to Triton as indicated by
/// 'artifact_type'.
///
///   TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
///     made available to Triton via the local filesystem. 'location' gives
///     the full path to the directory. Ownership of the contents of the
///     directory is transferred to Triton and the agent must not
///     modify or free the contents until TRITONREPOAGENT_ModelFinalize.
///     The local filesystem directory can be created using
///     TRITONREPOAGENT_ModelRepositoryLocationAcquire or the agent can use
///     its own local filesystem API.
///
///   TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
///     made available to Triton via a remote filesystem. 'location' gives
///     the full path to the remote filesystem directory. Ownership of the
///     contents of the directory is transferred to Triton and
///     the agent must not modify or free the contents until
///     TRITONREPOAGENT_ModelFinalize.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param location The location to use for the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryUpdate(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const TRITONREPOAGENT_ArtifactType artifact_type, const char* location);
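/// A sketch of the acquire/update flow while handling
/// TRITONREPOAGENT_ACTION_LOAD in TRITONREPOAGENT_ModelAction
/// (file-copying details omitted):
///
///   const char* location = nullptr;
///   TRITONSERVER_Error* err =
///       TRITONREPOAGENT_ModelRepositoryLocationAcquire(
///           agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, &location);
///   if (err != nullptr) {
///     return err;
///   }
///   // < populate 'location' with the modified model repository files >
///   return TRITONREPOAGENT_ModelRepositoryUpdate(
///       agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, location);
///
/// The acquired location must later be released with
/// TRITONREPOAGENT_ModelRepositoryLocationRelease, typically in
/// TRITONREPOAGENT_ModelFinalize.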
/// Get the number of agent parameters defined for a model.
///
/// \param agent The agent.
/// \param model The model.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameterCount(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
uint32_t* count);
/// Get a parameter name and value. The caller does not own the
/// returned strings and must not modify or delete them.
///
/// \param agent The agent.
/// \param model The model.
/// \param index The index of the parameter. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONREPOAGENT_ModelParameterCount.
/// \param parameter_name Returns the name of the parameter.
/// \param parameter_value Returns the value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelParameter(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const uint32_t index, const char** parameter_name,
const char** parameter_value);
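/// For example (a sketch), an agent can iterate over all of a model's
/// parameters as:
///
///   uint32_t count = 0;
///   TRITONSERVER_Error* err =
///       TRITONREPOAGENT_ModelParameterCount(agent, model, &count);
///   for (uint32_t i = 0; (err == nullptr) && (i < count); ++i) {
///     const char* name = nullptr;
///     const char* value = nullptr;
///     err = TRITONREPOAGENT_ModelParameter(agent, model, i, &name, &value);
///   }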
/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. If the model repository does not contain a
/// config.pbtxt file then 'model_config' is returned as nullptr.
///
/// \param agent The agent.
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelConfig(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const uint32_t config_version, TRITONSERVER_Message** model_config);
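/// A sketch of the ownership contract: the returned message, if any,
/// must be released with TRITONSERVER_MessageDelete (declared in
/// tritonserver.h):
///
///   TRITONSERVER_Message* config = nullptr;
///   TRITONSERVER_Error* err = TRITONREPOAGENT_ModelConfig(
///       agent, model, 1 /* config_version */, &config);
///   if ((err == nullptr) && (config != nullptr)) {
///     const char* base = nullptr;
///     size_t byte_size = 0;
///     err = TRITONSERVER_MessageSerializeToJson(config, &base, &byte_size);
///     // < inspect the JSON in [base, base + byte_size) >
///     TRITONSERVER_MessageDelete(config);
///   }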
/// Get the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelState(
TRITONREPOAGENT_AgentModel* model, void** state);
/// Set the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelSetState(
TRITONREPOAGENT_AgentModel* model, void* state);
/// Get the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_State(
TRITONREPOAGENT_Agent* agent, void** state);
/// Set the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_SetState(
TRITONREPOAGENT_Agent* agent, void* state);
///
/// The following functions can be implemented by an agent. Functions
/// indicated as required must be implemented or the agent will fail
/// to load.
///
/// Initialize an agent. This function is optional. This function is
/// called once when an agent is loaded to allow the agent to
/// initialize any state associated with the agent. An agent has a
/// single state that is shared across all invocations of the agent.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Initialize(
TRITONREPOAGENT_Agent* agent);
/// Finalize for an agent. This function is optional. This function is
/// called once, just before the agent is unloaded. All state
/// associated with the agent should be freed and any threads created
/// for the agent should be exited/joined before returning from this
/// function.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Finalize(
TRITONREPOAGENT_Agent* agent);
/// Initialize a model associated with an agent. This function is optional.
/// This function is called once when an agent model's lifecycle begins to allow
/// the agent model to initialize any state associated with it. An agent model
/// has a single state that is shared across the entire lifecycle of the agent
/// model.
///
/// \param agent The agent to be associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelInitialize(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Finalize for a model. This function is optional. This function is
/// called once, just before the end of the agent model's lifecycle. All state
/// associated with the agent model should be freed and any threads created
/// for the agent model should be exited/joined before returning from this
/// function. If the model acquired a model location using
/// TRITONREPOAGENT_ModelRepositoryLocationAcquire, it must call
/// TRITONREPOAGENT_ModelRepositoryLocationRelease to release that location.
///
/// \param agent The agent associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelFinalize(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Handle an action for a specified model. This function is
/// required. Triton will not perform multiple simultaneous calls to
/// this function for a given agent and model; however, there may be
/// simultaneous calls for the agent for different models.
///
/// If the agent does not handle the action the agent should
/// immediately return success (nullptr).
///
/// Any modification to the model's repository must be made when 'action_type'
/// is TRITONREPOAGENT_ACTION_LOAD.
/// To modify the model's repository the agent must either acquire a mutable
/// location via TRITONREPOAGENT_ModelRepositoryLocationAcquire
/// or use its own managed location, report the location to Triton via
/// TRITONREPOAGENT_ModelRepositoryUpdate, and then return
/// success (nullptr). If the agent does not need to make any changes
/// to the model repository it should simply return success without
/// calling TRITONREPOAGENT_ModelRepositoryUpdate.
/// To indicate that a model load should fail return a non-success status.
///
/// \param agent The agent.
/// \param model The model that is the target of the action.
/// \param action_type The type of action the agent should handle for the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
const TRITONREPOAGENT_ActionType action_type);
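/// For example (a sketch), an agent that only reacts to model loads
/// could implement TRITONREPOAGENT_ModelAction as:
///
///   TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
///       TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
///       const TRITONREPOAGENT_ActionType action_type)
///   {
///     switch (action_type) {
///       case TRITONREPOAGENT_ACTION_LOAD:
///         // < optionally acquire a location, modify the repository and
///         //   report it via TRITONREPOAGENT_ModelRepositoryUpdate >
///         return nullptr;
///       default:
///         return nullptr;  // action not handled: report success
///     }
///   }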
#ifdef __cplusplus
}
#endif
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONSERVER API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that the Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
TRITONSERVER_TYPE_INVALID,
TRITONSERVER_TYPE_BOOL,
TRITONSERVER_TYPE_UINT8,
TRITONSERVER_TYPE_UINT16,
TRITONSERVER_TYPE_UINT32,
TRITONSERVER_TYPE_UINT64,
TRITONSERVER_TYPE_INT8,
TRITONSERVER_TYPE_INT16,
TRITONSERVER_TYPE_INT32,
TRITONSERVER_TYPE_INT64,
TRITONSERVER_TYPE_FP16,
TRITONSERVER_TYPE_FP32,
TRITONSERVER_TYPE_FP64,
TRITONSERVER_TYPE_BYTES,
TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType
TRITONSERVER_StringToDataType(const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has a variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param dtype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t
TRITONSERVER_DataTypeByteSize(TRITONSERVER_DataType datatype);
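/// For example (a sketch), the byte size of a dense tensor buffer can be
/// computed from its datatype and shape; this does not apply to
/// TRITONSERVER_TYPE_BYTES, which is variable-sized:
///
///   const int64_t shape[] = {8, 3, 224, 224};
///   size_t byte_size = TRITONSERVER_DataTypeByteSize(TRITONSERVER_TYPE_FP32);
///   for (size_t i = 0; i < sizeof(shape) / sizeof(shape[0]); ++i) {
///     byte_size *= shape[i];
///   }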
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
TRITONSERVER_MEMORY_CPU,
TRITONSERVER_MEMORY_CPU_PINNED,
TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
TRITONSERVER_PARAMETER_STRING,
TRITONSERVER_PARAMETER_INT,
TRITONSERVER_PARAMETER_BOOL,
TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'.
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create a parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
const char* name, const TRITONSERVER_ParameterType type, const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
TRITONSERVER_Parameter* parameter);
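/// A sketch of creating and releasing string and bytes parameters; note
/// that 'blob' must remain valid for as long as the bytes parameter
/// exists:
///
///   TRITONSERVER_Parameter* p1 = TRITONSERVER_ParameterNew(
///       "key", TRITONSERVER_PARAMETER_STRING, "value");
///   const unsigned char blob[] = {0x01, 0x02, 0x03};
///   TRITONSERVER_Parameter* p2 =
///       TRITONSERVER_ParameterBytesNew("blob", blob, sizeof(blob));
///   // < use the parameters >
///   TRITONSERVER_ParameterDelete(p1);
///   TRITONSERVER_ParameterDelete(p2);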
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
TRITONSERVER_INSTANCEGROUPKIND_AUTO,
TRITONSERVER_INSTANCEGROUPKIND_CPU,
TRITONSERVER_INSTANCEGROUPKIND_GPU,
TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
TRITONSERVER_LOG_INFO,
TRITONSERVER_LOG_WARN,
TRITONSERVER_LOG_ERROR,
TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
TRITONSERVER_LOG_DEFAULT,
TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
TRITONSERVER_LogLevel level, const char* filename, const int line,
const char* msg);
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates an error, and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
TRITONSERVER_ERROR_UNKNOWN,
TRITONSERVER_ERROR_INTERNAL,
TRITONSERVER_ERROR_NOT_FOUND,
TRITONSERVER_ERROR_INVALID_ARG,
TRITONSERVER_ERROR_UNAVAILABLE,
TRITONSERVER_ERROR_UNSUPPORTED,
TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code
TRITONSERVER_ErrorCode(TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
TRITONSERVER_Error* error);
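/// A typical pattern (sketch) for consuming an error returned by this
/// API, shown here with TRITONSERVER_ApiVersion and <stdio.h>:
///
///   uint32_t major = 0, minor = 0;
///   TRITONSERVER_Error* err = TRITONSERVER_ApiVersion(&major, &minor);
///   if (err != nullptr) {
///     fprintf(
///         stderr, "error: %s - %s\n", TRITONSERVER_ErrorCodeString(err),
///         TRITONSERVER_ErrorMessage(err));
///     TRITONSERVER_ErrorDelete(err);
///   }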
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after the
/// TRITONSERVER_ResponseAllocatorAllocFn_t function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
*TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by the allocator, taking into account the caller's preferred
/// type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type ID
/// preferred by the allocator, taking into account the caller's preferred
/// type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp,
const char* tensor_name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
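/// A minimal CPU-only sketch of the alloc_fn/release_fn pair described
/// above, using malloc/free (<stdlib.h>); a real allocator would
/// normally honor the preferred memory type and may place buffers in
/// pinned or GPU memory:
///
///   TRITONSERVER_Error* CpuAlloc(
///       TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
///       size_t byte_size, TRITONSERVER_MemoryType memory_type,
///       int64_t memory_type_id, void* userp, void** buffer,
///       void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
///       int64_t* actual_memory_type_id)
///   {
///     *buffer = (byte_size == 0) ? nullptr : malloc(byte_size);
///     *buffer_userp = nullptr;
///     *actual_memory_type = TRITONSERVER_MEMORY_CPU;
///     *actual_memory_type_id = 0;
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_Error* CpuRelease(
///       TRITONSERVER_ResponseAllocator* allocator, void* buffer,
///       void* buffer_userp, size_t byte_size,
///       TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
///   {
///     free(buffer);
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_ResponseAllocator* allocator = nullptr;
///   TRITONSERVER_ResponseAllocatorNew(
///       &allocator, CpuAlloc, CpuRelease, nullptr /* start_fn */);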
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as for
/// the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. The function will
/// usually be called before alloc_fn to determine the allocator's preferred
/// memory type and memory type ID in the current situation, so that Triton
/// can make execution decisions accordingly.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from a serialized JSON string.
///
/// \param message Returns the new message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
TRITONSERVER_Message* message, const char** base, size_t* byte_size);
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
const char** base, size_t* byte_size);
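// For example, a minimal sketch that writes the metrics in Prometheus text
// format to stdout (assumes 'metrics' was obtained elsewhere, e.g. from
// TRITONSERVER_ServerMetrics declared later in this header, and that errors
// are checked by the caller):
//
//   const char* base = nullptr;
//   size_t byte_size = 0;
//   TRITONSERVER_MetricsFormatted(
//       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
//   fwrite(base, 1, byte_size, stdout);  // valid only while 'metrics' exists
//   TRITONSERVER_MetricsDelete(metrics);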
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
/// Tracing disabled. No trace activities are reported.
TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MIN = 1,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MAX = 2,
/// Record timestamps for the inference request.
TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
/// Record input and output tensor values for the inference request.
TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
TRITONSERVER_TRACE_REQUEST_START = 0,
TRITONSERVER_TRACE_QUEUE_START = 1,
TRITONSERVER_TRACE_COMPUTE_START = 2,
TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
TRITONSERVER_TRACE_COMPUTE_END = 5,
TRITONSERVER_TRACE_REQUEST_END = 6,
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, const char* name,
TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
const int64_t* shape, uint64_t dim_count,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
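// For example, a minimal sketch of a timestamp-only trace (the callback
// bodies are placeholders; a real application would record or forward the
// reported activity, and error checking is elided):
//
//   void MyActivityFn(
//       TRITONSERVER_InferenceTrace* trace,
//       TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
//       void* userp)
//   {
//     // Copy anything needed from 'trace' here; do not retain the pointer.
//   }
//
//   void MyReleaseFn(TRITONSERVER_InferenceTrace* trace, void* userp)
//   {
//     // Ownership of 'trace' passes to this callback.
//     TRITONSERVER_InferenceTraceDelete(trace);
//   }
//
//   TRITONSERVER_InferenceTrace* trace = nullptr;
//   TRITONSERVER_InferenceTraceNew(
//       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
//       MyActivityFn, MyReleaseFn, nullptr /* trace_userp */);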
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback functions will be called to
/// report activity for 'trace' as well as for any child traces spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
///     function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
TRITONSERVER_InferenceResponse* response, const uint32_t flags,
void* userp);
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server The inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
TRITONSERVER_InferenceRequest** inference_request,
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
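// For example, a minimal sketch marking a request as both the start and the
// end of a sequence of length one (assumes 'request' was created with
// TRITONSERVER_InferenceRequestNew; error checking is elided):
//
//   TRITONSERVER_InferenceRequestSetFlags(
//       request,
//       TRITONSERVER_REQUEST_FLAG_SEQUENCE_START |
//           TRITONSERVER_REQUEST_FLAG_SEQUENCE_END);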
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate that two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string. The default
/// is the empty string "", which indicates that the request has no
/// correlation ID. If the correlation ID associated with the inference
/// request is an unsigned integer, then this function will return a failure.
/// The correlation ID is used to indicate that two or more inference requests
/// are related to each other. How this relationship is handled by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is handled by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestAddInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const TRITONSERVER_DataType datatype, const int64_t* shape,
uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, the data
/// type, and the shape of the input will be deduced from the model
/// configuration. This function must be called at most once per request, and
/// the request must have no other inputs, to ensure the deduction is accurate.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used to reference
/// the raw input in other Tritonserver APIs. It is not associated with the
/// name used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
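// For example, a minimal sketch adding a 1x3 FP32 input held in CPU memory
// (the input name "INPUT0" is illustrative and must match the model
// configuration; 'request' is assumed to have been created with
// TRITONSERVER_InferenceRequestNew; error checking is elided):
//
//   const int64_t shape[2] = {1, 3};
//   float data[3] = {1.0f, 2.0f, 3.0f};
//   TRITONSERVER_InferenceRequestAddInput(
//       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2 /* dim_count */);
//   TRITONSERVER_InferenceRequestAppendInputData(
//       request, "INPUT0", data, sizeof(data), TRITONSERVER_MEMORY_CPU,
//       0 /* memory_type_id */);
//   // 'data' must stay valid until the input is removed or the request is
//   // deleted.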
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will be
/// used, so a non-host-policy-specific version of the data must be added
/// using that API.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_ResponseAllocator* response_allocator,
void* response_allocator_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp);
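// For example, a minimal sketch wiring both callbacks (assumes 'request' was
// created as above and 'allocator' was created with
// TRITONSERVER_ResponseAllocatorNew, declared earlier in this header; the
// callback bodies are placeholders and error checking is elided):
//
//   void MyRequestRelease(
//       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
//       void* userp)
//   {
//     if (flags & TRITONSERVER_REQUEST_RELEASE_ALL) {
//       TRITONSERVER_InferenceRequestDelete(request);
//     }
//   }
//
//   void MyResponseComplete(
//       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
//       void* userp)
//   {
//     if (response != nullptr) {
//       // ... read the outputs, then release the response ...
//       TRITONSERVER_InferenceResponseDelete(response);
//     }
//   }
//
//   TRITONSERVER_InferenceRequestSetReleaseCallback(
//       request, MyRequestRelease, nullptr /* request_release_userp */);
//   TRITONSERVER_InferenceRequestSetResponseCallback(
//       request, allocator, nullptr /* response_allocator_userp */,
//       MyResponseComplete, nullptr /* response_userp */);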
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseDelete(
TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. A TRITONSERVER_Error
/// object is returned on failure; nullptr is returned on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseError(
TRITONSERVER_InferenceResponse* inference_response);
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model used to produce
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseModel(
TRITONSERVER_InferenceResponse* inference_response, const char** model_name,
int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseId(
TRITONSERVER_InferenceResponse* inference_response,
const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a void* pointer that must be cast
/// appropriately based on 'type'. For example:
///
///   const char* name;
///   TRITONSERVER_ParameterType type;
///   const void* vvalue;
///   TRITONSERVER_InferenceResponseParameter(
///       response, index, &name, &type, &vvalue);
///   switch (type) {
///     case TRITONSERVER_PARAMETER_BOOL: {
///       bool value = *(reinterpret_cast<const bool*>(vvalue));
///       ...
///     }
///     case TRITONSERVER_PARAMETER_INT: {
///       int64_t value = *(reinterpret_cast<const int64_t*>(vvalue));
///       ...
///     }
///     case TRITONSERVER_PARAMETER_STRING: {
///       const char* value = reinterpret_cast<const char*>(vvalue);
///       ...
///     }
///   }
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseOutput(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
uint64_t* dim_count, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
void** userp);
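// For example, a minimal sketch iterating the outputs of a completed
// response (error checking is elided):
//
//   uint32_t output_count = 0;
//   TRITONSERVER_InferenceResponseOutputCount(
//       inference_response, &output_count);
//   for (uint32_t idx = 0; idx < output_count; ++idx) {
//     const char* name = nullptr;
//     TRITONSERVER_DataType datatype;
//     const int64_t* shape = nullptr;
//     uint64_t dim_count = 0;
//     const void* base = nullptr;
//     size_t byte_size = 0;
//     TRITONSERVER_MemoryType memory_type;
//     int64_t memory_type_id = 0;
//     void* userp = nullptr;
//     TRITONSERVER_InferenceResponseOutput(
//         inference_response, idx, &name, &datatype, &shape, &dim_count,
//         &base, &byte_size, &memory_type, &memory_type_id, &userp);
//     // 'base' may reside in GPU memory depending on 'memory_type'.
//   }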
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if there is no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesNew(
TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesDelete(
TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
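// For example, a minimal sketch describing a CPU buffer and attaching it to
// a request input (the request and the input name "INPUT0" are assumed to
// have been set up as shown earlier; error checking is elided):
//
//   float data[3] = {1.0f, 2.0f, 3.0f};
//   TRITONSERVER_BufferAttributes* attrs = nullptr;
//   TRITONSERVER_BufferAttributesNew(&attrs);
//   TRITONSERVER_BufferAttributesSetMemoryType(
//       attrs, TRITONSERVER_MEMORY_CPU);
//   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
//   TRITONSERVER_BufferAttributesSetByteSize(attrs, sizeof(data));
//   TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
//       request, "INPUT0", data, attrs);
//   TRITONSERVER_BufferAttributesDelete(attrs);  // once no longer needed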
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If a CUDA IPC handle does not exist for the
/// buffer, nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
TRITONSERVER_MODEL_CONTROL_NONE,
TRITONSERVER_MODEL_CONTROL_POLL,
TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
TRITONSERVER_RATE_LIMIT_OFF,
TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsNew(
TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsDelete(
TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetServerId(
TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in model repository will be
/// loaded on startup. After startup any changes to the model repository will
/// be ignored. Calling TRITONSERVER_ServerPollModelRepository will result in
/// an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in model repository will be
/// loaded on startup. The model repository can be polled periodically using
/// TRITONSERVER_ServerPollModelRepository and the server will load, unload,
/// and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in model repository will
/// not be loaded on startup. The corresponding model control APIs must be
/// called to load / unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect in TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
TRITONSERVER_ServerOptions* options, const char* model_name);
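// For example, a minimal sketch configuring explicit model control with a
// single startup model (the repository path "/models" and the model name
// "mymodel" are illustrative; error checking is elided):
//
//   TRITONSERVER_ServerOptions* options = nullptr;
//   TRITONSERVER_ServerOptionsNew(&options);
//   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
//   TRITONSERVER_ServerOptionsSetModelControlMode(
//       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
//   TRITONSERVER_ServerOptionsSetStartupModel(options, "mymodel");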
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiting prioritizes the
/// inference execution using the number of times each instance has been
/// given a chance to run. The execution gets to run only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: The rate limiting is turned off and the
/// inference gets executed whenever an instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for instance groups
/// in the model config to determine whether a resource is global. In case of
/// conflicting resource types in different model configurations, the server
/// will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
TRITONSERVER_ServerOptions* options, const char* resource_name,
const size_t resource_count, const int device);
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used in buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file A string defining the file where the log outputs will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// output to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogFile(
TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogInfo(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogWarn(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogError(
TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
TRITONSERVER_ServerOptions* options, int level);
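// For example, a minimal sketch of a typical logging configuration (assumes
// 'options' was created with TRITONSERVER_ServerOptionsNew; the log file
// path is illustrative and error checking is elided):
//
//   TRITONSERVER_ServerOptionsSetLogFile(options, "/tmp/triton.log");
//   TRITONSERVER_ServerOptionsSetLogInfo(options, true);
//   TRITONSERVER_ServerOptionsSetLogWarn(options, true);
//   TRITONSERVER_ServerOptionsSetLogError(options, true);
//   TRITONSERVER_ServerOptionsSetLogVerbose(options, 0);  // verbose off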
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetMetrics(
TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently only TRITONSERVER_INSTANCEGROUPKIND_GPU is supported.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
TRITONSERVER_ServerOptions* options,
const TRITONSERVER_InstanceGroupKind kind, const int device_id,
const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
TRITONSERVER_ServerOptions* options, const char* backend_name,
const char* setting, const char* value);
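// A hedged sketch of passing a backend command-line setting through this
// call; the backend name, setting, and value below are illustrative and
// error checking is omitted.
//
//   TRITONSERVER_ServerOptionsSetBackendConfig(
//       options, "tensorflow", "version", "2");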
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
TRITONSERVER_ServerOptions* options, const char* policy_name,
const char* setting, const char* value);
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
TRITONSERVER_BATCH_UNKNOWN = 1,
TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
TRITONSERVER_TXN_ONE_TO_ONE = 1,
TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerNew(
TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerDelete(
TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerStop(
TRITONSERVER_Server* server);
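// A minimal server lifecycle sketch, assuming a model repository at the
// hypothetical path "/models" and ignoring the returned errors for brevity.
//
//   TRITONSERVER_ServerOptions* options = nullptr;
//   TRITONSERVER_ServerOptionsNew(&options);
//   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
//   TRITONSERVER_Server* server = nullptr;
//   TRITONSERVER_ServerNew(&server, options);
//   TRITONSERVER_ServerOptionsDelete(options);
//   // ... serve inference requests ...
//   TRITONSERVER_ServerStop(server);    // optional; delete also stops
//   TRITONSERVER_ServerDelete(server);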
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as its value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path,
const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
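// A hedged sketch of registering an additional repository with a name
// mapping; the repository path, directory name, and override name are
// illustrative and error checking is omitted.
//
//   TRITONSERVER_Parameter* mapping = TRITONSERVER_ParameterNew(
//       "add_sub_dir", TRITONSERVER_PARAMETER_STRING, "add_sub");
//   const TRITONSERVER_Parameter* mappings[] = {mapping};
//   TRITONSERVER_ServerRegisterModelRepository(
//       server, "/extra_models", mappings, 1 /* mapping_count */);
//   TRITONSERVER_ParameterDelete(mapping);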
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsLive(
TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsReady(
TRITONSERVER_Server* server, bool* ready);
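// A small sketch of the health checks, assuming 'server' was created with
// TRITONSERVER_ServerNew; the returned TRITONSERVER_Error values are ignored
// here, but 'live' and 'ready' are only meaningful when those calls succeed.
//
//   bool live = false;
//   bool ready = false;
//   TRITONSERVER_ServerIsLive(server, &live);
//   TRITONSERVER_ServerIsReady(server, &ready);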
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIsReady(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, bool* ready);
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* flags, void** voidp);
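// A hedged sketch of querying the batch properties of a hypothetical model
// named "mymodel"; passing -1 lets the server choose the version and error
// checking is omitted.
//
//   uint32_t flags = 0;
//   void* voidp = nullptr;
//   TRITONSERVER_ServerModelBatchProperties(
//       server, "mymodel", -1 /* model_version */, &flags, &voidp);
//   const bool batching_supported =
//       (flags & TRITONSERVER_BATCH_FIRST_DIM) != 0;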
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* txn_flags, void** voidp);
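// Similarly, a sketch of detecting a decoupled model, again using the
// hypothetical model name "mymodel" and ignoring the returned error.
//
//   uint32_t txn_flags = 0;
//   TRITONSERVER_ServerModelTransactionProperties(
//       server, "mymodel", -1 /* model_version */, &txn_flags, nullptr);
//   const bool decoupled = (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0;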
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetadata(
TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelMetadata(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelStatistics(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelConfig(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, const uint32_t config_version,
TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Returns the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIndex(
TRITONSERVER_Server* server, uint32_t flags,
TRITONSERVER_Message** model_index);
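// A minimal sketch of retrieving the index of ready models and serializing
// it to JSON with TRITONSERVER_MessageSerializeToJson; error checking is
// omitted and the serialized buffer remains owned by the message object.
//
//   TRITONSERVER_Message* index = nullptr;
//   TRITONSERVER_ServerModelIndex(
//       server, TRITONSERVER_INDEX_FLAG_READY, &index);
//   const char* json = nullptr;
//   size_t json_size = 0;
//   TRITONSERVER_MessageSerializeToJson(index, &json, &json_size);
//   // ... use 'json' / 'json_size' ...
//   TRITONSERVER_MessageDelete(index);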
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. Returned error indicates if model loaded
/// successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerLoadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. Returned error indicates if model
/// loaded successfully or not.
/// Currently the following parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
TRITONSERVER_Server* server, const char* model_name,
const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
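// A hedged sketch of loading a model with an overriding configuration passed
// through the "config" parameter; the model name and JSON content are
// illustrative and error checking is omitted.
//
//   const char* config_json =
//       "{\"name\":\"mymodel\",\"backend\":\"onnxruntime\",\"max_batch_size\":8}";
//   TRITONSERVER_Parameter* config_param = TRITONSERVER_ParameterNew(
//       "config", TRITONSERVER_PARAMETER_STRING, config_json);
//   const TRITONSERVER_Parameter* params[] = {config_param};
//   TRITONSERVER_ServerLoadModelWithParameters(
//       server, "mymodel", params, 1 /* parameter_count */);
//   TRITONSERVER_ParameterDelete(config_param);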
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully
/// unloaded; a success code will be returned immediately.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerUnloadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent models that
/// were loaded along with the requested model (for example, the models
/// composing an ensemble). Unloading a model that is not loaded on the
/// server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded; a success code will be returned immediately.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
TRITONSERVER_Server* server, const char* model_name);
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetrics(
TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerInferAsync(
TRITONSERVER_Server* server,
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceTrace* trace);
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
TRITONSERVER_METRIC_KIND_COUNTER,
TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyDelete(
TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricNew(
TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricDelete(
TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricValue(
TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricIncrement(
TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The value to set the metric to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricSet(
TRITONSERVER_Metric* metric, double value);
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_GetMetricKind(
TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
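// A minimal end-to-end sketch of the custom metrics API above; the family
// name, description, and label are illustrative and error checking is
// omitted. Note the required deletion order: metrics before their family.
//
//   TRITONSERVER_MetricFamily* family = nullptr;
//   TRITONSERVER_MetricFamilyNew(
//       &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
//       "Total number of custom requests");
//   TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
//       "source", TRITONSERVER_PARAMETER_STRING, "example");
//   const TRITONSERVER_Parameter* labels[] = {label};
//   TRITONSERVER_Metric* metric = nullptr;
//   TRITONSERVER_MetricNew(&metric, family, labels, 1 /* label_count */);
//   TRITONSERVER_ParameterDelete(label);  // labels may be released here
//   TRITONSERVER_MetricIncrement(metric, 1.0);
//   TRITONSERVER_MetricDelete(metric);        // delete metrics first...
//   TRITONSERVER_MetricFamilyDelete(family);  // ...then their family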
#ifdef __cplusplus
}
#endif
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
namespace {
Status
GetTFSpecializedBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* specialized_name)
{
std::string tf_version_str = "2";
const auto& itr = config_map.find("tensorflow");
if (itr != config_map.end()) {
if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
if ((tf_version_str != "1") && (tf_version_str != "2")) {
return Status(
Status::Code::INVALID_ARG,
"unexpected TensorFlow library version '" + tf_version_str +
"', expects 1 or 2.");
}
}
}
*specialized_name += tf_version_str;
return Status::Success;
}
} // namespace
Status
BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val)
{
for (const auto& pr : config) {
if (pr.first == key) {
*val = pr.second;
return Status::Success;
}
}
return Status(
Status::Code::INTERNAL,
std::string("unable to find common backend configuration for '") + key +
"'");
}
Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
try {
*val = std::stod(str);
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as double");
}
return Status::Success;
}
Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
try {
std::string lowercase_str{str};
std::transform(
lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
[](unsigned char c) { return std::tolower(c); });
*val = (lowercase_str == "true");
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as bool");
}
return Status::Success;
}
Status
BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map, std::string* dir)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL,
"unable to find global backends directory configuration");
}
RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
return Status::Success;
}
Status
BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
*mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
*mcc = 0;
#endif // TRITON_ENABLE_GPU
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find common backend configuration");
}
std::string min_compute_capability_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "min-compute-capability", &min_compute_capability_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToDouble(min_compute_capability_str, mcc));
return Status::Success;
}
Status
BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find auto-complete configuration");
}
std::string auto_complete_config_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "auto-complete-config", &auto_complete_config_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToBool(auto_complete_config_str, acc));
return Status::Success;
}
Status
BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name)
{
*specialized_name = backend_name;
if (backend_name == "tensorflow") {
RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
}
return Status::Success;
}
Status
BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
*libname = "triton_" + backend_name + ".dll";
#else
*libname = "libtriton_" + backend_name + ".so";
#endif
return Status::Success;
}
Status
BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit)
{
*memory_limit = 1.0;
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL,
"unable to find global backends directory configuration");
}
static std::string key_prefix = "model-load-gpu-limit-device-";
std::string memory_limit_str;
auto status = BackendConfiguration(
itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
// Allow missing key, default to 1.0 (no limit) if the limit is not specified
if (status.IsOk()) {
RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
memory_limit_str, memory_limit));
}
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val);
/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
const std::string& str, double* val);
/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(const std::string& str, bool* val);
/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* dir);
/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);
/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);
/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name);
/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname);
/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit);
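// A hedged usage sketch of these helpers. It assumes the common typedefs
// behave as the implementation in backend_config.cc expects: the
// empty-string key of BackendCmdlineConfigMap holds the global settings and
// each BackendCmdlineConfig is a list of key/value string pairs. The values
// below are illustrative and error handling is omitted.
//
//   triton::common::BackendCmdlineConfigMap config_map;
//   config_map[""].emplace_back(
//       "backend-directory", "/opt/tritonserver/backends");
//   config_map[""].emplace_back("auto-complete-config", "true");
//
//   std::string dir;
//   BackendConfigurationGlobalBackendsDirectory(config_map, &dir);
//   bool acc = false;
//   BackendConfigurationAutoCompleteConfig(config_map, &acc);
//   std::string libname;
//   BackendConfigurationBackendLibraryName("onnxruntime", &libname);
//   // On Linux 'libname' is "libtriton_onnxruntime.so".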
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
//
// TritonBackend
//
Status
TritonBackend::Create(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value backend_config_json(
triton::common::TritonJson::ValueType::OBJECT);
if (!backend_cmdline_config.empty()) {
triton::common::TritonJson::Value cmdline_json(
backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : backend_cmdline_config) {
RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(
backend_config_json.Add("cmdline", std::move(cmdline_json)));
}
TritonServerMessage backend_config(backend_config_json);
auto local_backend = std::shared_ptr<TritonBackend>(
new TritonBackend(name, dir, libpath, backend_config));
// Load the library and initialize all the entrypoints
RETURN_IF_ERROR(local_backend->LoadBackendLibrary());
// Backend initialization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object. We must set the shared
// library path to point to the backend directory in case the
// backend library attempts to load additional shared libraries.
if (local_backend->backend_init_fn_ != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));
TRITONSERVER_Error* err = local_backend->backend_init_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
local_backend->UpdateAttributes();
*backend = std::move(local_backend);
return Status::Success;
}
Status
TritonBackend::UpdateAttributes()
{
if (backend_attri_fn_ == nullptr) {
return Status::Success;
}
// Create an Attribute object for the backend to fill. Note that it copies
// some fields from 'attributes_' while the others use default values. This
// is an ad hoc way to determine whether an attribute was set by the backend,
// so that the current value can be kept or updated accordingly.
Attribute latest;
latest.exec_policy_ = attributes_.exec_policy_;
RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(this),
reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));
// Update attributes that were set
attributes_.exec_policy_ = latest.exec_policy_;
if (!latest.preferred_groups_.empty()) {
attributes_.preferred_groups_ = latest.preferred_groups_;
}
return Status::Success;
}
TritonBackend::TritonBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const TritonServerMessage& backend_config)
: name_(name), dir_(dir), libpath_(libpath),
backend_config_(backend_config), state_(nullptr)
{
ClearHandles();
}
TritonBackend::~TritonBackend()
{
LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";
// Backend finalization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object.
if (backend_fini_fn_ != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
"failed finalizing backend");
}
ClearHandles();
}
void
TritonBackend::ClearHandles()
{
dlhandle_ = nullptr;
backend_init_fn_ = nullptr;
backend_fini_fn_ = nullptr;
backend_attri_fn_ = nullptr;
model_init_fn_ = nullptr;
model_fini_fn_ = nullptr;
inst_init_fn_ = nullptr;
inst_fini_fn_ = nullptr;
inst_exec_fn_ = nullptr;
}
Status
TritonBackend::LoadBackendLibrary()
{
TritonBackendInitFn_t bifn;
TritonBackendFiniFn_t bffn;
TritonBackendAttriFn_t bafn;
TritonModelInitFn_t mifn;
TritonModelFiniFn_t mffn;
TritonModelInstanceInitFn_t iifn;
TritonModelInstanceFiniFn_t iffn;
TritonModelInstanceExecFn_t iefn;
{
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));
// Backend initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
reinterpret_cast<void**>(&bifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
reinterpret_cast<void**>(&bffn)));
// Backend attribute function, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
reinterpret_cast<void**>(&bafn)));
// Model initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
reinterpret_cast<void**>(&mifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
reinterpret_cast<void**>(&mffn)));
// Model instance initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
reinterpret_cast<void**>(&iifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
reinterpret_cast<void**>(&iffn)));
// Model instance execute function, required
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
reinterpret_cast<void**>(&iefn)));
}
backend_init_fn_ = bifn;
backend_fini_fn_ = bffn;
backend_attri_fn_ = bafn;
model_init_fn_ = mifn;
model_fini_fn_ = mffn;
inst_init_fn_ = iifn;
inst_fini_fn_ = iffn;
inst_exec_fn_ = iefn;
return Status::Success;
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
*major = TRITONBACKEND_API_VERSION_MAJOR;
*minor = TRITONBACKEND_API_VERSION_MINOR;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*name = tb->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*backend_config = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*policy = tb->ExecutionPolicy();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetExecutionPolicy(policy);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tb->Directory().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
static TritonMemoryManager gMemoryManager;
*manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*state = tb->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetState(state);
return nullptr; // success
}
} // extern C
//
// TritonBackendManager
//
static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;
Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
std::lock_guard<std::mutex> lock(mu_);
// If there is already a manager then we just use it...
*manager = backend_manager_.lock();
if (*manager != nullptr) {
return Status::Success;
}
manager->reset(new TritonBackendManager());
backend_manager_ = *manager;
return Status::Success;
}
Status
TritonBackendManager::CreateBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
std::lock_guard<std::mutex> lock(mu_);
const auto& itr = backend_map_.find(libpath);
if (itr != backend_map_.end()) {
*backend = itr->second;
return Status::Success;
}
RETURN_IF_ERROR(TritonBackend::Create(
name, dir, libpath, backend_cmdline_config, backend));
backend_map_.insert({libpath, *backend});
return Status::Success;
}
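// A hedged sketch of how a caller might obtain the shared backend manager
// and create (or reuse) a backend proxy; the backend name and paths are
// illustrative and error handling is omitted.
//
//   std::shared_ptr<TritonBackendManager> manager;
//   TritonBackendManager::Create(&manager);
//   triton::common::BackendCmdlineConfig cmdline_config;  // may be empty
//   std::shared_ptr<TritonBackend> backend;
//   manager->CreateBackend(
//       "onnxruntime", "/opt/tritonserver/backends/onnxruntime",
//       "/opt/tritonserver/backends/onnxruntime/libtriton_onnxruntime.so",
//       cmdline_config, &backend);
//   // A second CreateBackend() call with the same libpath returns the
//   // cached shared_ptr instead of loading the library again.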
Status
TritonBackendManager::BackendState(
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>*
backend_state)
{
std::lock_guard<std::mutex> lock(mu_);
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
backend_state_map(
new std::unordered_map<std::string, std::vector<std::string>>);
for (const auto& backend_pair : backend_map_) {
auto& libpath = backend_pair.first;
auto backend = backend_pair.second;
const char* backend_config;
size_t backend_config_size;
backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
backend_state_map->insert(
{backend->Name(), std::vector<std::string>{libpath, backend_config}});
}
*backend_state = std::move(backend_state_map);
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Proxy to a backend shared library.
//
class TritonBackend {
public:
struct Attribute {
Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
TRITONBACKEND_ExecutionPolicy exec_policy_;
std::vector<inference::ModelInstanceGroup> preferred_groups_;
};
typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_cnt);
static Status Create(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
~TritonBackend();
const std::string& Name() const { return name_; }
const std::string& Directory() const { return dir_; }
const TritonServerMessage& BackendConfig() const { return backend_config_; }
const Attribute& BackendAttributes() const { return attributes_; }
TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
{
return attributes_.exec_policy_;
}
void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
{
attributes_.exec_policy_ = policy;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
TritonModelInstanceInitFn_t ModelInstanceInitFn() const
{
return inst_init_fn_;
}
TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
{
return inst_fini_fn_;
}
TritonModelInstanceExecFn_t ModelInstanceExecFn() const
{
return inst_exec_fn_;
}
private:
typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
TRITONBACKEND_Backend* backend,
TRITONBACKEND_BackendAttribute* backend_attributes);
TritonBackend(
const std::string& name, const std::string& dir,
const std::string& libpath, const TritonServerMessage& backend_config);
void ClearHandles();
Status LoadBackendLibrary();
Status UpdateAttributes();
// The name of the backend.
const std::string name_;
// Full path to the directory holding backend shared library and
// other artifacts.
const std::string dir_;
// Full path to the backend shared library.
const std::string libpath_;
// Backend configuration as JSON
TritonServerMessage backend_config_;
// backend attributes
Attribute attributes_;
// dlopen / dlsym handles
void* dlhandle_;
TritonBackendInitFn_t backend_init_fn_;
TritonBackendFiniFn_t backend_fini_fn_;
TritonBackendAttriFn_t backend_attri_fn_;
TritonModelInitFn_t model_init_fn_;
TritonModelFiniFn_t model_fini_fn_;
TritonModelInstanceInitFn_t inst_init_fn_;
TritonModelInstanceFiniFn_t inst_fini_fn_;
TritonModelInstanceExecFn_t inst_exec_fn_;
// Opaque state associated with the backend.
void* state_;
};
//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
public:
static Status Create(std::shared_ptr<TritonBackendManager>* manager);
Status CreateBackend(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
Status BackendState(
std::unique_ptr<
std::unordered_map<std::string, std::vector<std::string>>>*
backend_state);
private:
DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
TritonBackendManager() = default;
std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
TRITONBACKEND_MemoryManager* manager, void** buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
const uint64_t byte_size)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
{
auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.ErrorCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"GPU memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
{
TRITONSERVER_MemoryType mt = memory_type;
auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.ErrorCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Pinned memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU: {
*buffer = malloc(byte_size);
if (*buffer == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
}
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
TRITONBACKEND_MemoryManager* manager, void* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
auto status = CudaMemoryManager::Free(buffer, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
auto status = PinnedMemoryManager::Free(buffer);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU:
free(buffer);
break;
}
return nullptr; // success
}
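// A small usage sketch from a backend's perspective, allocating and then
// releasing a CPU buffer through the 'manager' handle obtained via
// TRITONBACKEND_BackendMemoryManager; error checking is omitted.
//
//   void* buffer = nullptr;
//   TRITONBACKEND_MemoryManagerAllocate(
//       manager, &buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */,
//       1024 /* byte_size */);
//   // ... use 'buffer' ...
//   TRITONBACKEND_MemoryManagerFree(
//       manager, buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);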
} // extern C
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {
// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
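// Create a TritonModel for the given model repository path and model
// configuration: localize the repository content, locate and load the
// backend shared library, resolve and default the backend configuration,
// run the backend's optional TRITONBACKEND_ModelInitialize hook, create
// the model instances, and attach the configured scheduler.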
Status
TritonModel::Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model)
{
model->reset();
// The model configuration must specify a backend. The name of the
// corresponding shared library must be libtriton_<backend>.so.
if (model_config.backend().empty()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'backend' for '" + model_config.name() + "'");
}
// Localize the content of the model repository corresponding to
// 'model_name'. This model holds a handle to the localized content
// so that it persists as long as the model is loaded.
std::shared_ptr<LocalizedPath> localized_model_dir;
RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));
// Localize paths in backend model config
// [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
model_path, &model_config, &localized_model_dir));
// Get some internal configuration values needed for initialization.
std::string backend_dir;
RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
backend_cmdline_config_map, &backend_dir));
bool auto_complete_config = false;
RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
backend_cmdline_config_map, &auto_complete_config));
double min_compute_capability = 0;
RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
backend_cmdline_config_map, &min_compute_capability));
std::string specialized_backend_name;
RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
backend_cmdline_config_map, model_config.backend(),
&specialized_backend_name));
std::string backend_libname;
RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
specialized_backend_name, &backend_libname));
// Get the path to the backend shared library. Search path is
// version directory, model directory, global backend directory.
const auto localized_model_path = localized_model_dir->Path();
const auto version_path =
JoinPath({localized_model_path, std::to_string(version)});
const std::string global_path =
JoinPath({backend_dir, specialized_backend_name});
const std::vector<std::string> search_paths = {
version_path, localized_model_path, global_path};
std::string backend_libdir;
std::string backend_libpath;
for (const auto& path : search_paths) {
const auto full_path = JoinPath({path, backend_libname});
bool exists = false;
RETURN_IF_ERROR(FileExists(full_path, &exists));
if (exists) {
backend_libdir = path;
backend_libpath = full_path;
break;
}
}
if (backend_libpath.empty()) {
return Status(
Status::Code::INVALID_ARG, "unable to find '" + backend_libname +
"' for model '" + model_config.name() +
"', searched: " + version_path + ", " +
model_path + ", " + global_path);
}
// Resolve the global backend configuration with the specific backend
// configuration
triton::common::BackendCmdlineConfig config;
RETURN_IF_ERROR(ResolveBackendConfigs(
backend_cmdline_config_map, model_config.backend(), config));
RETURN_IF_ERROR(SetBackendConfigDefaults(config));
std::shared_ptr<TritonBackend> backend;
RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
model_config.backend(), backend_libdir, backend_libpath, config,
&backend));
// Normalize backend-dependent config
{
const auto& attributes = backend->BackendAttributes();
// [WIP] formalize config normalization / validation
RETURN_IF_ERROR(NormalizeInstanceGroup(
min_compute_capability, attributes.preferred_groups_, &model_config));
RETURN_IF_ERROR(
ValidateInstanceGroup(model_config, min_compute_capability));
}
// Create and initialize the model.
std::unique_ptr<TritonModel> local_model(new TritonModel(
server, localized_model_dir, backend, min_compute_capability, version,
model_config, auto_complete_config));
TritonModel* raw_local_model = local_model.get();
  // Model initialization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object. We must set the shared library
  // path to point to the backend directory in case the backend
  // library attempts to load additional shared libraries.
if (backend->ModelInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));
TRITONSERVER_Error* err = backend->ModelInitFn()(
reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
// Initialize the model for Triton core usage
RETURN_IF_ERROR(local_model->Init(is_config_provided));
bool device_blocking = false;
if (local_model->backend_->ExecutionPolicy() ==
TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
if (model_config.has_sequence_batching()) {
LOG_INFO << "Overriding execution policy to "
"\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
<< model_config.name() << "\"";
} else {
device_blocking = true;
}
}
// Create and initialize the model instances for this model.
RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
raw_local_model, backend_cmdline_config_map, host_policy_map,
model_config, device_blocking));
RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
*model = std::move(local_model);
return Status::Success;
}
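// Merge the global (empty-name) backend settings with the settings given
// for this specific backend; when the same key appears in both, the
// backend-specific value wins. For example (illustrative values only), a
// global {"default-max-batch-size", "4"} combined with a backend-specific
// {"default-max-batch-size", "8"} resolves to "8".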
Status
TritonModel::ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config)
{
const auto& global_itr = backend_cmdline_config_map.find(std::string());
const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
if (specific_itr == backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
for (auto setting : global_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr == backend_cmdline_config_map.end()) {
for (auto setting : specific_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
triton::common::BackendCmdlineConfig global_backend_config =
global_itr->second;
triton::common::BackendCmdlineConfig specific_backend_config =
specific_itr->second;
std::sort(global_backend_config.begin(), global_backend_config.end());
std::sort(specific_backend_config.begin(), specific_backend_config.end());
size_t global_index = 0;
size_t specific_index = 0;
while (global_index < global_backend_config.size() &&
specific_index < specific_backend_config.size()) {
auto& current_global_setting = global_backend_config.at(global_index);
auto& current_specific_setting =
specific_backend_config.at(specific_index);
if (current_specific_setting.first.compare(
current_global_setting.first) == 0) {
// specific setting overrides global setting
config.push_back(current_specific_setting);
++global_index;
++specific_index;
} else if (
current_specific_setting.first.compare(current_global_setting.first) <
0) {
config.push_back(current_specific_setting);
++specific_index;
} else {
config.push_back(current_global_setting);
++global_index;
}
}
// add the rest of the global configs
if (global_index < global_backend_config.size()) {
auto& current_global_setting = global_backend_config.at(global_index);
config.push_back(current_global_setting);
}
// add the rest of the specific settings
if (specific_index < specific_backend_config.size()) {
auto& current_specific_setting =
specific_backend_config.at(specific_index);
config.push_back(current_specific_setting);
}
} // else empty config
return Status::Success;
}
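// Default backend configuration settings applied by
// SetBackendConfigDefaults() when they are not specified on the command
// line.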
const std::unordered_map<std::string, std::string> backend_config_defaults(
{{"default-max-batch-size", "4"}});
Status
TritonModel::SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config)
{
auto backend_config_defaults_copy = backend_config_defaults;
for (auto& setting : config) {
if (setting.first.compare("default-max-batch-size") == 0) {
LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
<< "," << setting.second;
backend_config_defaults_copy.erase(setting.first);
}
if (backend_config_defaults_copy.empty()) {
break;
}
}
// Anything left should be added to the config
for (const auto& default_setting : backend_config_defaults_copy) {
LOG_VERBOSE(1) << "Adding default backend config setting: "
<< default_setting.first << "," << default_setting.second;
config.push_back(
std::make_pair(default_setting.first, default_setting.second));
}
return Status::Success;
}
Status
TritonModel::AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
if (passive) {
passive_instances_.emplace_back(std::move(instance));
} else {
instances_.emplace_back(std::move(instance));
}
return Status::Success;
}
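// Replace selected fields of the current model configuration with the
// auto-completed configuration provided by the backend (max batch size,
// inputs, outputs and, if not already chosen, the scheduling choice),
// then re-normalize the configuration to populate any missing fields.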
Status
TritonModel::UpdateModelConfig(
const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
{
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
updated_config_message, &buffer, &byte_size));
inference::ModelConfig updated_config;
RETURN_IF_ERROR(
JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
auto config = Config();
config.set_max_batch_size(updated_config.max_batch_size());
auto inputs_config = config.mutable_input();
*inputs_config = updated_config.input();
auto outputs_config = config.mutable_output();
*outputs_config = updated_config.output();
if (!config.scheduling_choice_case()) {
if (updated_config.has_dynamic_batching()) {
auto dynamic_batching_config = config.mutable_dynamic_batching();
*dynamic_batching_config = updated_config.dynamic_batching();
} else if (updated_config.has_sequence_batching()) {
auto sequence_batching_config = config.mutable_sequence_batching();
*sequence_batching_config = updated_config.sequence_batching();
} else if (updated_config.has_ensemble_scheduling()) {
auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
*ensemble_scheduling_config = updated_config.ensemble_scheduling();
} // else do nothing
} else if (
config.scheduling_choice_case() !=
updated_config.scheduling_choice_case()) {
return Status(
triton::common::Error::Code::INTERNAL,
(std::string("Cannot update scheduling choice from ") +
std::to_string(config.scheduling_choice_case()) + std::string(" to ") +
         std::to_string(updated_config.scheduling_choice_case()) +
std::string(" when auto-completing."))
.c_str());
} // else do nothing
// Need to normalize the model configuration for
// populating missing fields.
RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));
RETURN_IF_ERROR(SetModelConfig(config));
return Status::Success;
}
Status
TritonModel::SetConfiguredScheduler()
{
std::unique_ptr<Scheduler> scheduler;
// Need to enforce equal shape batches (i.e. non-ragged batches) if
// the model 1) allows one or more variable-size input tensors that
// are not marked as 'allow_ragged_batch' or 2) has one or more
// shape-tensor inputs. This is not needed if all input shapes are
// non-variable and if there are no shape tensors... so we don't
// enable it in that case for efficiency reasons.
std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
  for (const auto& input : config_.input()) {
if (input.is_shape_tensor()) {
enforce_equal_shape_tensors.insert({input.name(), true});
} else if (
!input.allow_ragged_batch() &&
(triton::common::GetElementCount(input) == -1)) {
enforce_equal_shape_tensors.insert({input.name(), false});
}
}
// If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
// otherwise use the default DynamicBatchScheduler.
if (config_.has_sequence_batching()) {
// Sequence batcher
RETURN_IF_ERROR(SequenceBatchScheduler::Create(
this, enforce_equal_shape_tensors, &scheduler));
} else if (config_.has_dynamic_batching()) {
// Dynamic batcher
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
config_.max_batch_size(), enforce_equal_shape_tensors,
config_.dynamic_batching(),
config_.response_cache().enable() /* response_cache_enable */,
&scheduler));
} else {
// Default scheduler. Use dynamic batch scheduler (with batching
// disabled) as the default scheduler.
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
1 /* max_batch_size */,
std::unordered_map<
std::string, bool>() /* enforce_equal_shape_tensors */,
false /* preserve_ordering */,
config_.response_cache().enable() /* response_cache_enable */,
std::set<int32_t>() /* preferred_batch_sizes */,
0 /* max_queue_delay_microseconds */, &scheduler));
}
return SetScheduler(std::move(scheduler));
}
Status
TritonModel::Initialize()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->Initialize());
}
return Status::Success;
}
Status
TritonModel::WarmUp()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->WarmUp());
}
return Status::Success;
}
TritonModel::TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config)
: Model(
min_compute_capability, localized_model_dir->Path(), version, config),
server_(server), min_compute_capability_(min_compute_capability),
auto_complete_config_(auto_complete_config),
localized_model_dir_(localized_model_dir), backend_(backend),
state_(nullptr)
{
}
TritonModel::~TritonModel()
{
// Explicitly delete/finalize all model instances before finalizing
// the model itself.
instances_.clear();
passive_instances_.clear();
  // Unregister itself from the rate limiter. Note this should happen
  // after all instances are destructed. Destructing the instances
  // ensures there are no instance threads waiting on the rate limiter
  // to receive their payloads.
server_->GetRateLimiter()->UnregisterModel(this);
// Model finalization is optional... The TRITONBACKEND_Model
// object is this TritonModel object.
if (backend_->ModelFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
"failed finalizing model");
}
}
extern "C" {
//
// TRITONBACKEND_Model
//
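// A minimal illustrative sketch (assumed typical backend usage, not part
// of this file) of how a backend's TRITONBACKEND_ModelInitialize might
// query these entry points; error handling is omitted for brevity:
//
//   const char* name;
//   TRITONBACKEND_ModelName(model, &name);
//   uint64_t version;
//   TRITONBACKEND_ModelVersion(model, &version);
//   TRITONSERVER_Message* config_message;
//   TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config_message);
//   // ... parse the serialized JSON configuration, then ...
//   TRITONSERVER_MessageDelete(config_message);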
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*name = tm->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*version = tm->Version();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tm->LocalizedModelPath().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message** model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
std::string model_config_json;
Status status =
ModelConfigToJson(tm->Config(), config_version, &model_config_json);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*model_config = reinterpret_cast<TRITONSERVER_Message*>(
new TritonServerMessage(std::move(model_config_json)));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
TRITONBACKEND_Model* model, bool* auto_complete_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*auto_complete_config = tm->AutoCompleteConfig();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message* model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
Status status = tm->UpdateModelConfig(config_version, model_config);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*state = tm->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
tm->SetState(state);
return nullptr; // success
}
///
/// TRITONBACKEND_Request
///
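// A minimal illustrative sketch (assumed typical backend usage, not part
// of this file) of iterating the inputs of a request with the entry
// points below; error handling is omitted:
//
//   uint32_t input_count;
//   TRITONBACKEND_RequestInputCount(request, &input_count);
//   for (uint32_t i = 0; i < input_count; ++i) {
//     TRITONBACKEND_Input* input;
//     TRITONBACKEND_RequestInputByIndex(request, i, &input);
//     const char* name;
//     TRITONSERVER_DataType dtype;
//     const int64_t* shape;
//     uint32_t dims_count;
//     uint64_t byte_size;
//     uint32_t buffer_count;
//     TRITONBACKEND_InputProperties(
//         input, &name, &dtype, &shape, &dims_count, &byte_size,
//         &buffer_count);
//   }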
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*id = tr->Id().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(TRITONBACKEND_Request* request, uint64_t* id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::UINT64) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not an unsigned int")
.c_str());
}
*id = correlation_id.UnsignedIntValue();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*flags = tr->Flags();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::STRING) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not a string")
.c_str());
}
*id = correlation_id.StringValue().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableInputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** input_name)
{
*input_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. The inputs must be kept in a map, and given the typically
  // small number of inputs this linear search is preferable to having
  // every request maintain the inputs as both a map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input_name = in->Name().c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
TRITONBACKEND_Request* request, const char* name,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
const auto& itr = inputs.find(name);
if (itr == inputs.end()) {
*input = nullptr;
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "unknown request input name " + name).c_str());
}
InferenceRequest::Input* in = itr->second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
TRITONBACKEND_Request* request, const uint32_t index,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. The inputs must be kept in a map, and given the typically
  // small number of inputs this linear search is preferable to having
  // every request maintain the inputs as both a map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableRequestedOutputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** output_name)
{
*output_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& routputs = tr->ImmutableRequestedOutputs();
if (index >= routputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(routputs.size()) +
" requested outputs")
.c_str());
}
  // The requested outputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // set. The requested outputs are kept in a set, and given the
  // typically small number of them this linear search should not be a
  // performance issue.
uint32_t cnt = 0;
for (const auto& rout : routputs) {
if (cnt++ == index) {
*output_name = rout.c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
auto status =
tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestRelease(
TRITONBACKEND_Request* request, uint32_t release_flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceRequest> ur(tr);
InferenceRequest::Release(std::move(ur), release_flags);
return nullptr; // success
}
///
/// TRITONBACKEND_State
///
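// A minimal illustrative sketch (assumed typical usage by a sequence
// backend, not part of this file) of creating and committing an output
// state; the state name, type and shape are examples only and error
// handling is omitted:
//
//   TRITONBACKEND_State* state;
//   const int64_t shape[] = {1, 128};
//   TRITONBACKEND_StateNew(
//       &state, request, "output_state" /* example name */,
//       TRITONSERVER_TYPE_FP32, shape, 2 /* dims_count */);
//   void* buffer;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_StateBuffer(
//       state, &buffer, 128 * sizeof(float), &memory_type, &memory_type_id);
//   // ... write the new state into 'buffer', then commit it ...
//   TRITONBACKEND_StateUpdate(state);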
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
SequenceState* ts = reinterpret_cast<SequenceState*>(state);
auto status = ts->Update();
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
SequenceState* lstate;
std::vector<int64_t> lshape(shape, shape + dims_count);
auto& sequence_state = tr->GetSequenceStates();
if (sequence_state == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unable to add state '") + name +
"'. State configuration is missing for model '" + tr->ModelName() +
"'.")
.c_str());
}
Status status = sequence_state->OutputState(
name, TritonToDataType(datatype), lshape, &lstate);
if (!status.IsOk()) {
*state = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*state = reinterpret_cast<TRITONBACKEND_State*>(lstate);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
Status status = Status::Success;
// If the buffer size exactly matches the buffer available, reuse the
// currently allocated buffer.
if (to->Data()->TotalByteSize() == buffer_byte_size) {
const std::shared_ptr<AllocatedMemory>& memory =
reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
TRITONSERVER_MemoryType current_memory_type;
int64_t current_memory_type_id;
void* lbuffer =
memory->MutableBuffer(&current_memory_type, &current_memory_type_id);
    // Reuse the current buffer only if the requested memory type and
    // memory type id match it; otherwise allocate a new buffer with
    // the requested memory type and memory type id.
if (current_memory_type == *memory_type &&
current_memory_type_id == *memory_type_id) {
*buffer = lbuffer;
} else {
std::shared_ptr<AllocatedMemory> memory =
std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
} else {
std::shared_ptr<AllocatedMemory> memory = std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
TRITONBACKEND_State* state,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
to->Data()->BufferAt(
0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
return nullptr; // success
}
//
// TRITONBACKEND_ResponseFactory
//
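// A minimal illustrative sketch (assumed typical usage by a decoupled
// backend, not part of this file): a factory is captured from the request
// so responses can still be created after the request itself has been
// released; error handling is omitted:
//
//   TRITONBACKEND_ResponseFactory* factory;
//   TRITONBACKEND_ResponseFactoryNew(&factory, request);
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
//   // ... add outputs and send the response ...
//   TRITONBACKEND_ResponseFactoryDelete(factory);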
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::shared_ptr<InferenceResponseFactory>* response_factory =
new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
*factory = reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
delete response_factory;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
Status status = (*response_factory)->SendFlags(send_flags);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
///
/// TRITONBACKEND_Response
///
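// A minimal illustrative sketch (assumed typical backend usage, not part
// of this file) of creating a response, adding an output and sending it;
// the output name, type and shape are examples only and error handling is
// omitted:
//
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNew(&response, request);
//   TRITONBACKEND_Output* output;
//   const int64_t shape[] = {1, 4};
//   TRITONBACKEND_ResponseOutput(
//       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
//   void* buffer;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_OutputBuffer(
//       output, &buffer, 4 * sizeof(float), &memory_type, &memory_type_id);
//   // ... fill 'buffer' with the output data ...
//   TRITONBACKEND_ResponseSend(
//       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* error */);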
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceResponse> tresp;
Status status = tr->ResponseFactory()->CreateResponse(&tresp);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
std::unique_ptr<InferenceResponse> tr;
Status status = (*response_factory)->CreateResponse(&tr);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
delete tr;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
TRITONBACKEND_Response* response, const char* name, const char* value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
TRITONBACKEND_Response* response, const char* name, const bool value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
std::vector<int64_t> lshape(shape, shape + dims_count);
InferenceResponse::Output* loutput;
Status status = tr->AddOutput(
name, TritonToDataType(datatype), std::move(lshape), &loutput);
if (!status.IsOk()) {
*output = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status;
std::unique_ptr<InferenceResponse> utr(tr);
if (error == nullptr) {
status = InferenceResponse::Send(std::move(utr), send_flags);
} else {
status = InferenceResponse::SendWithStatus(
std::move(utr), send_flags,
Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
TRITONSERVER_ErrorMessage(error)));
}
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
TRITONBACKEND_Input* input, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (host_policy_name != nullptr) {
if (byte_size != nullptr) {
*byte_size = ti->Data(host_policy_name)->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
}
} else {
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBufferAttributes(
index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_attributes = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name,
const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status =
(host_policy_name == nullptr)
? ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id)
: ti->DataBufferForHostPolicy(
index, buffer, buffer_byte_size, memory_type, memory_type_id,
host_policy_name);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
TRITONBACKEND_Output* output, void** buffer,
const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
Status status = to->AllocateDataBuffer(
buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
TRITONBACKEND_Output* output,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
*buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
to->GetBufferAttributes());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
TRITONBACKEND_BackendAttribute* backend_attributes,
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count)
{
auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
ba->preferred_groups_.emplace_back();
auto& pg = ba->preferred_groups_.back();
switch (kind) {
case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
break;
case TRITONSERVER_INSTANCEGROUPKIND_CPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_GPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
break;
}
pg.set_count(count);
if (device_ids != nullptr) {
for (size_t i = 0; i < id_count; ++i) {
pg.add_gpus(device_ids[i]);
}
}
return nullptr;
}
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {
class InferenceServer;
class TritonModelInstance;
//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
public:
static Status Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model);
~TritonModel();
const std::string& LocalizedModelPath() const
{
return localized_model_dir_->Path();
}
InferenceServer* Server() { return server_; }
bool AutoCompleteConfig() const { return auto_complete_config_; }
Status UpdateModelConfig(
const uint32_t config_version,
TRITONSERVER_Message* updated_config_message);
const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }
const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
{
return instances_;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
Status AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive);
private:
DISALLOW_COPY_AND_ASSIGN(TritonModel);
TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config);
// Set the scheduler based on the model configuration. The scheduler
// can only be set once for a backend.
Status SetConfiguredScheduler();
// Merges the global backend configs with the specific
// backend configs.
static Status ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config);
// Sets defaults for some backend configurations when none are specified on
// the command line.
static Status SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config);
Status Initialize();
Status WarmUp();
// The server object that owns this model. The model holds this as a
// raw pointer because the lifetime of the server is guaranteed to
// be longer than the lifetime of a model owned by the server.
InferenceServer* server_;
// The minimum supported compute capability on device.
const double min_compute_capability_;
// Whether the backend should attempt to auto-complete the model config.
const bool auto_complete_config_;
  // The localized repo directory holding the model. If localization
  // required creation of a temporary local copy, then that copy will
  // persist as long as this object is retained by this model.
std::shared_ptr<LocalizedPath> localized_model_dir_;
// Backend used by this model.
std::shared_ptr<TritonBackend> backend_;
// The model instances for this model.
std::vector<std::unique_ptr<TritonModelInstance>> instances_;
std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;
// Opaque state associated with this model.
void* state_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
namespace {
// Utilities for warmup feature
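// Response allocator callback used for warmup requests. Warmup output is
// discarded, so the allocation is always a plain malloc'd CPU buffer.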
TRITONSERVER_Error*
WarmupResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
*buffer = malloc(byte_size);
if (*buffer != nullptr) {
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
return nullptr;
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"failed to allocate output buffer for warmup.");
}
TRITONSERVER_Error*
WarmupResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
free(buffer);
return nullptr;
}
ResponseAllocator warmup_allocator = ResponseAllocator(
WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);
void
WarmupResponseComplete(
TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
void* userp)
{
auto res_pair = reinterpret_cast<
std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
if (iresponse != nullptr) {
auto err = TRITONSERVER_InferenceResponseError(iresponse);
if (err != nullptr) {
// The error vector is shared by all requests in the batch for now
static std::mutex res_mtx;
{
std::lock_guard<std::mutex> lk(res_mtx);
res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
}
TRITONSERVER_ErrorDelete(err);
}
// Just delete the response, warmup doesn't check for correctness
LOG_TRITONSERVER_ERROR(
TRITONSERVER_InferenceResponseDelete(iresponse),
"deleting warmup response");
}
// Last response
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
res_pair->first.set_value();
}
}
void
WarmupRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
// Don't need to release request here, it is managed in WarmupData
if (userp != nullptr) {
auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
warmup_promise->set_value();
}
}
}
} // namespace
TritonModelInstance::TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices)
: model_(model), name_(name), index_(index), kind_(kind),
device_id_(device_id), host_policy_(host_policy),
host_policy_message_(host_policy_message), profile_names_(profile_names),
passive_(passive), secondary_devices_(secondary_devices), state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
if (Metrics::Enabled()) {
// Use an ID in the metric only for GPU instances. Otherwise use
// METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
// metric.
const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
? device_id_
: METRIC_REPORTER_ID_CPU;
MetricModelReporter::Create(
model_->Name(), model_->Version(), id, model_->Config().metric_tags(),
&reporter_);
}
#endif // TRITON_ENABLE_METRICS
}
TritonModelInstance::~TritonModelInstance()
{
if (triton_backend_thread_.get() != nullptr) {
triton_backend_thread_->StopBackendThread();
}
// Model finalization is optional...
if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
model_->Backend()->ModelInstanceFiniFn()(
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
"failed finalizing model instance");
}
}
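// Create the model instances described by the instance_group settings of
// the model configuration: one instance per count for CPU/MODEL groups and
// one instance per count per GPU for GPU groups, each created under the
// NUMA/host policy configured for it.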
Status
TritonModelInstance::CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking)
{
static triton::common::HostPolicyCmdlineConfig empty_host_policy;
  // This map is used to assign a shared TritonBackendThread to the
  // instances on the same device when the device-blocking execution
  // policy is used.
std::map<uint32_t, std::shared_ptr<TritonBackendThread>> device_to_thread_map;
for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
for (const auto& profile_name : group.profile()) {
profile_names.push_back(profile_name);
}
std::vector<SecondaryDevice> secondary_devices;
for (const auto& secondary_device : group.secondary_devices()) {
secondary_devices.emplace_back(
inference::
ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
secondary_device.kind()),
secondary_device.device_id());
}
for (int32_t c = 0; c < group.count(); ++c) {
std::string instance_name{group.count() > 1
? group.name() + "_" + std::to_string(c)
: group.name()};
const bool passive = group.passive();
std::vector<std::tuple<
std::string, TRITONSERVER_InstanceGroupKind, int32_t,
const inference::ModelRateLimiter*>>
instance_setting;
if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
instance_setting.emplace_back(
group.host_policy().empty() ? "cpu" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
&group.rate_limiter());
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
for (const int32_t device_id : group.gpus()) {
instance_setting.emplace_back(
group.host_policy().empty() ? ("gpu_" + std::to_string(device_id))
: group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
&group.rate_limiter());
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
instance_setting.emplace_back(
group.host_policy().empty() ? "model" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
&group.rate_limiter());
} else {
return Status(
Status::Code::INVALID_ARG,
std::string("instance_group kind ") +
ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
}
      for (const auto& is : instance_setting) {
const auto& kind = std::get<1>(is);
const auto& id = std::get<2>(is);
const std::string& policy_name = std::get<0>(is);
const triton::common::HostPolicyCmdlineConfig* host_policy;
const auto policy_it = host_policy_map.find(policy_name);
if (policy_it != host_policy_map.end()) {
host_policy = &policy_it->second;
} else {
host_policy = &empty_host_policy;
}
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = CreateInstance(
model, instance_name, c, kind, id, profile_names, passive,
policy_name, *host_policy, *(std::get<3>(is)), device_blocking,
&device_to_thread_map, secondary_devices);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
        // When deploying on GPU, we want to make sure the GPU memory
        // usage is within the allowed range; otherwise, stop the
        // creation to ensure there is sufficient GPU memory for other
        // use. We check the usage after loading the instance to better
        // enforce the limit. If we checked before loading, we might
        // create an instance that occupies the rest of the available
        // memory, which defeats the purpose of the limit.
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}
}
}
}
}
return Status::Success;
}
Status
TritonModelInstance::CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value host_policy_json(
triton::common::TritonJson::ValueType::OBJECT);
triton::common::TritonJson::Value policy_setting_json(
host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : host_policy) {
RETURN_IF_ERROR(policy_setting_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(host_policy_json.Add(
host_policy_name.c_str(), std::move(policy_setting_json)));
TritonServerMessage host_policy_message(host_policy_json);
std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
model, name, index, kind, device_id, profile_names, passive, host_policy,
host_policy_message, secondary_devices));
TRITONBACKEND_ModelInstance* triton_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());
  // Instance initialization is optional... We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
if (model->Backend()->ModelInstanceInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));
TRITONSERVER_Error* err =
model->Backend()->ModelInstanceInitFn()(triton_instance);
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
if (!passive) {
RETURN_IF_ERROR(local_instance->GenerateWarmupData());
RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
local_instance.get(), rate_limiter_config));
RETURN_IF_ERROR(local_instance->SetBackendThread(
kind, device_id, device_blocking, device_to_thread_map));
}
RETURN_IF_ERROR(model->AddInstance(std::move(local_instance), passive));
return Status::Success;
}
Status
TritonModelInstance::SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map)
{
if (device_blocking && (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU)) {
auto thread_it = device_to_thread_map->find(device_id);
if (thread_it != device_to_thread_map->end()) {
LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
<< " on device " << device_id;
triton_backend_thread_ = thread_it->second;
}
}
if (triton_backend_thread_.get() == nullptr) {
std::unique_ptr<TritonBackendThread> local_backend_thread;
RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
Name(), this, 0 /* nice */, device_id, &local_backend_thread));
triton_backend_thread_ = std::move(local_backend_thread);
device_to_thread_map->insert({device_id, triton_backend_thread_});
} else {
triton_backend_thread_->AddModelInstance(this);
}
RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));
return Status::Success;
}
Status
TritonModelInstance::GenerateWarmupData()
{
warmup_samples_.clear();
for (const auto& warmup_setting : model_->Config().model_warmup()) {
if (warmup_setting.batch_size() == 0) {
LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
<< warmup_setting.name() << "'";
continue;
}
LOG_VERBOSE(1) << "Generating warmup sample data for '"
<< warmup_setting.name() << "'";
// Two passes. First pass to get max byte size for synthetic
// data. Second pass to add original inputs and override inputs
// for control inputs.
int64_t max_zero_byte_size = 0;
int64_t max_random_byte_size = 0;
for (const auto& input_meta : warmup_setting.inputs()) {
auto element_count =
triton::common::GetElementCount(input_meta.second.dims());
if (element_count == -1) {
return Status(
Status::Code::INVALID_ARG,
"warmup setting expects all variable-size dimensions are specified "
"for input '" +
input_meta.first + "'");
}
int64_t batch_byte_size =
element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = element_count * sizeof(int32_t);
}
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
// Triton expects STRING data in a special format (each element is
// prefixed with 4 bytes specifying the string length), so use zero
// data for simplicity (4 bytes * element count of zeros).
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
} else {
max_random_byte_size =
std::max(batch_byte_size, max_random_byte_size);
}
break;
}
default:
break;
}
}
warmup_samples_.emplace_back(warmup_setting.name(), warmup_setting.count());
auto& warmup_data = warmup_samples_.back();
// Create buffers for synthetic data
TRITONSERVER_MemoryType type;
int64_t type_id;
warmup_data.zero_data_.reset(new AllocatedMemory(
max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
memset(zero_buffer, 0, max_zero_byte_size);
warmup_data.random_data_.reset(new AllocatedMemory(
max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* random_buffer =
warmup_data.random_data_->MutableBuffer(&type, &type_id);
for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
random_buffer[offset] = rand();
}
// Prepare the inference requests for the specified sample. The in-process
// C API is not used because the request doesn't go through the same
// pipeline (i.e. no normalization / scheduler), so we need to prepare the
// request to the state just before the instance execute function is called.
for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
warmup_data.requests_.emplace_back(
new InferenceRequest(model_, model_->Version()));
auto& lrequest = warmup_data.requests_.back();
// Second pass to prepare original inputs.
std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
for (const auto& input_meta : warmup_setting.inputs()) {
auto batch1_element_count =
triton::common::GetElementCount(input_meta.second.dims());
auto batch_byte_size =
batch1_element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = batch1_element_count * sizeof(int32_t);
}
const char* allocated_ptr;
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
allocated_ptr = zero_buffer;
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
allocated_ptr = zero_buffer;
} else {
allocated_ptr = random_buffer;
}
break;
}
case inference::ModelWarmup_Input::InputDataTypeCase::
kInputDataFile: {
// For data provided from file, we can set buffer in first pass
warmup_data.provided_data_.emplace_back(new std::string());
auto input_data = warmup_data.provided_data_.back().get();
RETURN_IF_ERROR(ReadTextFile(
JoinPath({model_->LocalizedModelPath(), kWarmupDataFolder,
input_meta.second.input_data_file()}),
input_data));
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
batch_byte_size = input_data->size();
} else if (((size_t)batch_byte_size) > input_data->size()) {
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects " +
std::to_string(batch_byte_size) +
" bytes, but the data "
"provided from " +
input_meta.second.input_data_file() + "only has " +
std::to_string(input_data->size()) + " bytes");
}
allocated_ptr = input_data->data();
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects input '" +
input_meta.first + "' to have input_data_type set");
}
const inference::ModelInput* input_config;
bool is_original_input =
model_->GetInput(input_meta.first, &input_config).IsOk();
InferenceRequest::Input* input = nullptr;
std::vector<int64_t> input_meta_shape;
// Append the batch dimension only if the model supports batching
// and this is an original (not a control) input.
if ((model_->Config().max_batch_size() != 0) && is_original_input) {
input_meta_shape.push_back(1);
}
for (auto d : input_meta.second.dims()) {
input_meta_shape.push_back(d);
}
if (is_original_input) {
RETURN_IF_ERROR(lrequest->AddOriginalInput(
input_meta.first, input_meta.second.data_type(), input_meta_shape,
&input));
} else {
input_sps.emplace_back();
RETURN_IF_ERROR(lrequest->AddOverrideInput(
input_meta.first, input_meta.second.data_type(),
(model_->Config().max_batch_size() != 0 ? 1 : 0),
input_meta_shape, &input_sps.back()));
input = input_sps.back().get();
}
RETURN_IF_ERROR(input->AppendData(
allocated_ptr, batch_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */));
}
RETURN_IF_ERROR(lrequest->PrepareForInference());
// Override inputs must be added after PrepareForInference() is called
for (const auto& sp : input_sps) {
RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
}
}
}
return Status::Success;
}
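// Note on the STRING handling above (illustrative, following the comment in
// the first pass): Triton's STRING wire format prefixes each element with a
// 4-byte length followed by the raw bytes, so the element "abc" is laid out
// as {3, 0, 0, 0, 'a', 'b', 'c'} on a little-endian host. An all-zero buffer
// therefore decodes as a batch of zero-length strings, which is why random
// data is replaced with zero data for TYPE_STRING warmup inputs.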
void
TritonModelInstance::Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion)
{
// Use a thread local vector to avoid needing to malloc each
// time an inference is run.
thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
triton_requests.clear();
for (auto& r : requests) {
// Load the input states for the inference request.
r->LoadInputStates();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
}
Execute(triton_requests);
OnCompletion();
}
Status
TritonModelInstance::Initialize()
{
RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
return Status::Success;
}
Status
TritonModelInstance::WarmUp()
{
// move samples to local variable for scoped cleanup
std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
lwarmup_samples.swap(warmup_samples_);
for (auto& sample : lwarmup_samples) {
for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name() << " is running warmup sample '"
<< sample.sample_name_ << "' for iteration " << iteration;
// Request/response completion is asynchronous, so use promises to wait for
// completion. Error messages from the responses are collected in a vector.
std::vector<std::promise<void>> request_complete(sample.requests_.size());
std::vector<std::string> response_errors;
std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
response_complete(sample.requests_.size());
std::vector<TRITONBACKEND_Request*> triton_requests;
for (size_t i = 0; i < sample.requests_.size(); ++i) {
auto& request = sample.requests_[i];
request->SetReleaseCallback(
WarmupRequestComplete, &request_complete[i]);
response_complete[i].second = &response_errors;
request->SetResponseCallback(
&warmup_allocator, nullptr, WarmupResponseComplete,
&response_complete[i]);
// Capture timestamp before run to avoid incorrect accumulation from
// sequential warmup runs
#ifdef TRITON_ENABLE_STATS
request->CaptureRequestStartNs();
#endif // TRITON_ENABLE_STATS
request->CaptureQueueStartNs();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
}
Execute(triton_requests);
// Wait for warmup sample to complete and check error
for (size_t i = 0; i < sample.requests_.size(); ++i) {
request_complete[i].get_future().get();
response_complete[i].first.get_future().get();
}
if (response_errors.size() != 0) {
std::string err_str =
"failed to run warmup sample '" + sample.sample_name_ + "': ";
for (const auto& error : response_errors) {
err_str += (error + "; ");
}
// End warmup as soon as there is a failing sample
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name()
<< " failed to run warmup sample '"
<< sample.sample_name_ << "'";
return Status(Status::Code::INVALID_ARG, err_str);
}
}
}
return Status::Success;
}
void
TritonModelInstance::Execute(
std::vector<TRITONBACKEND_Request*>& triton_requests)
{
TRITONBACKEND_ModelInstance* triton_model_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
model_->Backend()->ModelInstanceExecFn();
// If there is an error then we retain ownership of 'requests'
// and must send error responses.
TRITONSERVER_Error* err = inst_exec_fn(
triton_model_instance, &triton_requests[0], triton_requests.size());
if (err != nullptr) {
Status status = Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
TRITONSERVER_ErrorMessage(err));
for (TRITONBACKEND_Request* tr : triton_requests) {
std::unique_ptr<InferenceRequest> ur(
reinterpret_cast<InferenceRequest*>(tr));
InferenceRequest::RespondIfError(ur, status, true /* release_requests */);
}
TRITONSERVER_ErrorDelete(err);
}
}
Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
const std::string name, TritonModelInstance* model_instance, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
TritonBackendThread* raw_triton_backend_thread =
new TritonBackendThread(name, model_instance->Model());
std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
runner->AddModelInstance(model_instance);
runner->backend_thread_ =
std::thread([raw_triton_backend_thread, nice, device_id]() {
raw_triton_backend_thread->BackendThread(nice, device_id);
});
triton_backend_thread->reset(runner.release());
return Status::Success;
}
void
TritonModelInstance::TritonBackendThread::AddModelInstance(
TritonModelInstance* model_instance)
{
model_instances_.push_back(model_instance);
}
Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
TritonModelInstance* model_instance)
{
// Initialize the instance on the backend thread
auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INIT, model_instance);
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, init_payload));
RETURN_IF_ERROR(init_payload->Wait());
// Warm-up the instance on the backend thread
auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::WARM_UP, model_instance);
RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
model_, warmup_payload));
RETURN_IF_ERROR(warmup_payload->Wait());
return Status::Success;
}
TritonModelInstance::TritonBackendThread::TritonBackendThread(
const std::string& name, TritonModel* model)
: name_(name), model_(model)
{
}
TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
StopBackendThread();
}
void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
if (backend_thread_.joinable()) {
// Signal the backend thread to exit and then wait for it...
auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::EXIT, model_instances_.back());
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
backend_thread_.join();
}
}
void
TritonModelInstance::TritonBackendThread::BackendThread(
const int nice, const int32_t device_id)
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
<< nice << " on device " << device_id << "...";
} else {
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice (requested nice " << nice << " failed)"
<< " on device " << device_id << "...";
}
#else
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice on device " << device_id << "...";
#endif
bool should_exit = false;
while (!should_exit) {
std::shared_ptr<Payload> payload;
model_->Server()->GetRateLimiter()->DequeuePayload(
model_instances_, &payload);
NVTX_RANGE(nvtx_, "BackendThread " + name_);
payload->Execute(&should_exit);
model_instances_.push_back(payload->GetInstance());
// Release the payload to the RateLimiter
model_->Server()->GetRateLimiter()->PayloadRelease(payload);
}
LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
TRITONBACKEND_ModelInstance* instance, const char** name)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*name = ti->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_InstanceGroupKind* kind)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*kind = ti->Kind();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*device_id = ti->DeviceId();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*host_policy = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&ti->HostPolicyMessage()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->Profiles().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
TRITONBACKEND_ModelInstance* instance, const uint32_t index,
const char** profile_name)
{
*profile_name = nullptr;
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rprofiles = ti->Profiles();
if (index >= rprofiles.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " + std::to_string(rprofiles.size()) +
" profiles")
.c_str());
}
*profile_name = rprofiles[index].c_str();
return nullptr; // success
}
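// Illustrative sketch (not part of the original source) of how a backend
// could enumerate the optimization profiles of an instance through the two
// calls above; the function name is hypothetical.
//
//   static void ExampleListProfiles(TRITONBACKEND_ModelInstance* instance)
//   {
//     uint32_t count = 0;
//     if (TRITONBACKEND_ModelInstanceProfileCount(instance, &count) == nullptr) {
//       for (uint32_t i = 0; i < count; ++i) {
//         const char* profile_name = nullptr;
//         if (TRITONBACKEND_ModelInstanceProfileName(
//                 instance, i, &profile_name) == nullptr) {
//           // 'profile_name' now points at the i-th profile string owned by
//           // the instance; do not free it.
//         }
//       }
//     }
//   }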
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->SecondaryDevices().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
int64_t* id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rsecondarydevices = ti->SecondaryDevices();
if (index >= rsecondarydevices.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " +
std::to_string(rsecondarydevices.size()) + " secondary devices")
.c_str());
}
*kind = rsecondarydevices[index].kind_.c_str();
*id = rsecondarydevices[index].id_;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*is_passive = ti->IsPassive();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
TRITONBACKEND_ModelInstance* instance, void** state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*state = ti->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
TRITONBACKEND_ModelInstance* instance, void* state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->SetState(state);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
const bool success, const uint64_t exec_start_ns,
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
tr->ReportStatistics(
ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
const uint64_t exec_start_ns, const uint64_t compute_start_ns,
const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {
class TritonModel;
class InferenceRequest;
//
// Represents a model instance.
//
class TritonModelInstance {
public:
static Status CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking);
~TritonModelInstance();
const std::string& Name() const { return name_; }
size_t Index() const { return index_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
{
return host_policy_;
}
const TritonServerMessage& HostPolicyMessage() const
{
return host_policy_message_;
}
bool IsPassive() const { return passive_; }
const std::vector<std::string>& Profiles() const { return profile_names_; }
struct SecondaryDevice {
SecondaryDevice(const std::string kind, const int64_t id)
: kind_(kind), id_(id)
{
}
const std::string kind_;
const int64_t id_;
};
const std::vector<SecondaryDevice>& SecondaryDevices() const
{
return secondary_devices_;
}
Status Initialize();
Status WarmUp();
void Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion);
TritonModel* Model() const { return model_; }
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
MetricModelReporter* MetricReporter() const { return reporter_.get(); }
private:
DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);
class TritonBackendThread;
TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices);
static Status CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices);
Status SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map);
Status GenerateWarmupData();
void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);
class TritonBackendThread {
public:
static Status CreateBackendThread(
const std::string name, TritonModelInstance* model, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread);
void AddModelInstance(TritonModelInstance* model_instance);
Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
void StopBackendThread();
~TritonBackendThread();
private:
TritonBackendThread(const std::string& name, TritonModel* model);
void BackendThread(const int nice, const int32_t device_id);
std::string name_;
TritonModel* model_;
std::deque<TritonModelInstance*> model_instances_;
std::thread backend_thread_;
std::atomic<bool> backend_thread_exit_;
};
std::shared_ptr<TritonBackendThread> triton_backend_thread_;
struct WarmupData {
WarmupData(const std::string& sample_name, const size_t count)
: sample_name_(sample_name), count_(std::max(count, size_t{1}))
{
}
std::string sample_name_;
size_t count_;
// Use a batch of requests to satisfy the batch size; this provides better
// alignment with the batch expected by the model, especially for sequence
// models.
std::vector<std::unique_ptr<InferenceRequest>> requests_;
// Placeholder for input data
std::unique_ptr<AllocatedMemory> zero_data_;
std::unique_ptr<AllocatedMemory> random_data_;
std::vector<std::unique_ptr<std::string>> provided_data_;
};
std::vector<WarmupData> warmup_samples_;
// The TritonModel object that owns this instance. The instance
// holds this as a raw pointer because the lifetime of the model is
// guaranteed to be longer than the lifetime of an instance owned by the
// model.
TritonModel* model_;
std::string name_;
size_t index_;
// For CPU device_id_ is always 0. For GPU device_id_ indicates the
// GPU device to be used by the instance.
TRITONSERVER_InstanceGroupKind kind_;
int32_t device_id_;
const triton::common::HostPolicyCmdlineConfig host_policy_;
TritonServerMessage host_policy_message_;
std::vector<std::string> profile_names_;
bool passive_;
std::vector<SecondaryDevice> secondary_devices_;
// Reporter for metrics, or nullptr if no metrics should be reported
std::shared_ptr<MetricModelReporter> reporter_;
// Opaque state associated with this model instance.
void* state_;
};
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {
void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
byte_size_ = byte_size;
}
void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
memory_type_ = memory_type;
}
void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
memory_type_id_ = memory_type_id;
}
void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
cuda_ipc_handle_.clear();
std::copy(
lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
void*
BufferAttributes::CudaIpcHandle()
{
if (cuda_ipc_handle_.empty()) {
return nullptr;
} else {
return reinterpret_cast<void*>(cuda_ipc_handle_.data());
}
}
size_t
BufferAttributes::ByteSize() const
{
return byte_size_;
}
TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
return memory_type_;
}
int64_t
BufferAttributes::MemoryTypeId() const
{
return memory_type_id_;
}
BufferAttributes::BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char* cuda_ipc_handle)
: byte_size_(byte_size), memory_type_(memory_type),
memory_type_id_(memory_type_id)
{
// cuda ipc handle size
cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);
if (cuda_ipc_handle != nullptr) {
std::copy(
cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
}
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
public:
BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char cuda_ipc_handle[64]);
BufferAttributes()
{
memory_type_ = TRITONSERVER_MEMORY_CPU;
memory_type_id_ = 0;
cuda_ipc_handle_.reserve(64);
}
// Set the buffer byte size
void SetByteSize(const size_t& byte_size);
// Set the buffer memory_type
void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);
// Set the buffer memory type id
void SetMemoryTypeId(const int64_t& memory_type_id);
// Set the cuda ipc handle
void SetCudaIpcHandle(void* cuda_ipc_handle);
// Get the cuda ipc handle
void* CudaIpcHandle();
// Get the byte size
size_t ByteSize() const;
// Get the memory type
TRITONSERVER_MemoryType MemoryType() const;
// Get the memory type id
int64_t MemoryTypeId() const;
private:
size_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t memory_type_id_;
std::vector<char> cuda_ipc_handle_;
};
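// Illustrative usage sketch (the values are assumed, not part of the original
// source):
//
//   BufferAttributes attrs;
//   attrs.SetByteSize(1024);
//   attrs.SetMemoryType(TRITONSERVER_MEMORY_GPU);
//   attrs.SetMemoryTypeId(0);
//   // For buffers shared across processes, a 64-byte CUDA IPC handle can be
//   // attached; SetCudaIpcHandle() copies CUDA_IPC_STRUCT_SIZE bytes from the
//   // given pointer, e.g.:
//   // attrs.SetCudaIpcHandle(&ipc_handle);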
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {
constexpr char kInferHeaderContentLengthHTTPHeader[] =
"Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";
constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";
constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";
constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";
constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";
constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";
constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";
#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif // TRITON_ENABLE_ENSEMBLE
constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
"auto_mixed_precision";
constexpr char kModelConfigPbTxt[] = "config.pbtxt";
constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";
constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";
constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;
#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well, such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif
#define TIMESPEC_TO_NANOS(TS) \
((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
(TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)
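// Illustrative usage of the two macros above (assumes a timespec captured
// with clock_gettime() on a POSIX system; not part of the original source):
//
//   struct timespec ts;
//   clock_gettime(CLOCK_MONOTONIC, &ts);
//   uint64_t duration_ns = TIMESPEC_TO_NANOS(ts);
//   uint64_t duration_ms = TIMESPEC_TO_MILLIS(ts);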
#define DISALLOW_MOVE(TypeName) TypeName(TypeName&&) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
DISALLOW_COPY(TypeName) \
DISALLOW_ASSIGN(TypeName)
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace {
#define RETURN_IF_CNMEM_ERROR(S, MSG) \
do { \
auto status__ = (S); \
if (status__ != CNMEM_STATUS_SUCCESS) { \
return Status( \
Status::Code::INTERNAL, \
(MSG) + ": " + cnmemGetErrorString(status__)); \
} \
} while (false)
std::string
PointerToString(void* ptr)
{
std::stringstream ss;
ss << ptr;
return ss.str();
}
} // namespace
namespace triton { namespace core {
std::unique_ptr<CudaMemoryManager> CudaMemoryManager::instance_;
std::mutex CudaMemoryManager::instance_mu_;
CudaMemoryManager::~CudaMemoryManager()
{
if (has_allocation_) {
auto status = cnmemFinalize();
if (status != CNMEM_STATUS_SUCCESS) {
LOG_ERROR << "Failed to finalize CUDA memory manager: [" << status << "] "
<< cnmemGetErrorString(status);
}
}
}
void
CudaMemoryManager::Reset()
{
std::lock_guard<std::mutex> lock(instance_mu_);
instance_.reset();
}
Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
// Ensure thread-safe creation of CUDA memory pool
std::lock_guard<std::mutex> lock(instance_mu_);
if (instance_ != nullptr) {
LOG_WARNING << "New CUDA memory pools could not be created since they "
"already exists";
return Status::Success;
}
std::set<int> supported_gpus;
auto status = GetSupportedGPUs(
&supported_gpus, options.min_supported_compute_capability_);
if (status.IsOk()) {
std::vector<cnmemDevice_t> devices;
for (auto gpu : supported_gpus) {
const auto it = options.memory_pool_byte_size_.find(gpu);
if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
devices.emplace_back();
auto& device = devices.back();
memset(&device, 0, sizeof(device));
device.device = gpu;
device.size = it->second;
LOG_INFO << "CUDA memory pool is created on device " << device.device
<< " with size " << device.size;
}
}
if (!devices.empty()) {
RETURN_IF_CNMEM_ERROR(
cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
std::string("Failed to finalize CUDA memory manager"));
} else {
LOG_INFO << "CUDA memory pool disabled";
}
// Create the instance so that CNMeM is finalized properly when it goes out of scope
instance_.reset(new CudaMemoryManager(!devices.empty()));
} else {
return Status(
status.ErrorCode(),
"Failed to initialize CUDA memory manager: " + status.Message());
}
return Status::Success;
}
Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemMalloc(ptr, size, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to allocate CUDA memory with byte size ") +
std::to_string(size) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemFree(ptr, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to deallocate CUDA memory at address ") +
PointerToString(ptr) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {
// This is a singleton class responsible for maintaining CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via functions provided by this class.
class CudaMemoryManager {
public:
// Options to configure CUDA memory manager.
struct Options {
Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
: min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
{
}
// The minimum compute capability of the supported devices.
double min_supported_compute_capability_;
// The size of CUDA memory reserved for the specified devices.
// The memory size will be rounded up to align with
// the default granularity (512 bytes).
// No memory will be reserved for devices that are not listed.
std::map<int, uint64_t> memory_pool_byte_size_;
};
~CudaMemoryManager();
// Create the memory manager based on 'options' specified.
// Return Status object indicating success or failure.
static Status Create(const Options& options);
// Allocate CUDA memory on GPU 'device_id' with
// the requested 'size' and return the pointer in 'ptr'.
// Return Status object indicating success or failure.
static Status Alloc(void** ptr, uint64_t size, int64_t device_id);
// Free the memory allocated by the memory manager on 'device_id'.
// Return Status object indicating success or failure.
static Status Free(void* ptr, int64_t device_id);
protected:
// Provide explicit control on the lifecycle of the CUDA memory manager,
// for testing only.
static void Reset();
private:
CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}
bool has_allocation_;
static std::unique_ptr<CudaMemoryManager> instance_;
static std::mutex instance_mu_;
};
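// Illustrative usage sketch (the pool size and device id are assumed values,
// not part of the original source):
//
//   CudaMemoryManager::Options options(
//       6.0 /* min_supported_compute_capability */,
//       {{0 /* device */, 64 * 1024 * 1024 /* 64 MB pool */}});
//   Status status = CudaMemoryManager::Create(options);
//   void* ptr = nullptr;
//   if (status.IsOk()) {
//     status = CudaMemoryManager::Alloc(&ptr, 1024 /* size */, 0 /* device */);
//   }
//   if (status.IsOk()) {
//     status = CudaMemoryManager::Free(ptr, 0 /* device */);
//   }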
}} // namespace triton::core
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {
#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
auto* copy_params = reinterpret_cast<CopyParams*>(args);
memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
delete copy_params;
}
#endif // TRITON_ENABLE_GPU
Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
*free = 0;
*total = 0;
#ifdef TRITON_ENABLE_GPU
// Make sure that the correct device is set before querying the memory
// info and then restore the device to what was set by the caller.
int current_device;
auto cuerr = cudaGetDevice(&current_device);
bool overridden = false;
if (cuerr == cudaSuccess) {
overridden = (current_device != device_id);
if (overridden) {
cuerr = cudaSetDevice(device_id);
}
}
if (cuerr == cudaSuccess) {
cuerr = cudaMemGetInfo(free, total);
}
if (overridden) {
cudaSetDevice(current_device);
}
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
(std::string("unable to get memory info for device ") +
std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
// If we can't enable peer access for one device pair, the best we can
// do is to skip it...
std::set<int> supported_gpus;
bool all_enabled = false;
if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
all_enabled = true;
int can_access_peer = false;
for (const auto& host : supported_gpus) {
auto cuerr = cudaSetDevice(host);
if (cuerr == cudaSuccess) {
for (const auto& peer : supported_gpus) {
if (host == peer) {
continue;
}
cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
cuerr = cudaDeviceEnablePeerAccess(peer, 0);
}
all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
}
}
}
}
if (!all_enabled) {
return Status(
Status::Code::UNSUPPORTED,
"failed to enable peer access for some device pairs");
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
NVTX_RANGE(nvtx_, "CopyBuffer");
*cuda_used = false;
// For CUDA memcpy, all host-to-host copies are blocking with respect to the
// host, so use memcpy() directly. In this case, we need to be careful that
// the src buffer remains valid.
if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
(dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
if (copy_on_stream) {
auto params = new CopyParams(dst, src, byte_size);
cudaLaunchHostFunc(
cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
*cuda_used = true;
} else {
memcpy(dst, src, byte_size);
}
#else
memcpy(dst, src, byte_size);
#endif // TRITON_ENABLE_GPU
} else {
#ifdef TRITON_ENABLE_GPU
RETURN_IF_CUDA_ERR(
cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
msg + ": failed to perform CUDA copy");
*cuda_used = true;
#else
return Status(
Status::Code::INTERNAL,
msg + ": try to use CUDA copy while GPU is not supported");
#endif // TRITON_ENABLE_GPU
}
return Status::Success;
}
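// Illustrative host-to-host use of CopyBuffer above (the buffers are assumed
// values, not part of the original source); no CUDA stream is needed because
// both endpoints are CPU memory and copy_on_stream is false:
//
//   std::vector<char> src(64, 0), dst(64);
//   bool cuda_used = false;
//   Status status = CopyBuffer(
//       "example copy", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
//       TRITONSERVER_MEMORY_CPU, 0 /* dst_memory_type_id */, src.size(),
//       src.data(), dst.data(), nullptr /* cuda_stream */, &cuda_used,
//       false /* copy_on_stream */);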
void
CopyBufferHandler(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, void* response_ptr,
triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
completion_queue)
{
bool cuda_used = false;
Status status = CopyBuffer(
msg, src_memory_type, src_memory_type_id, dst_memory_type,
dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}
#ifdef TRITON_ENABLE_GPU
Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
// Query the compute capability from the device
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
double compute_capability = cuprops.major + (cuprops.minor / 10.0);
if ((compute_capability > min_compute_capability) ||
(abs(compute_capability - min_compute_capability) < 0.01)) {
return Status::Success;
} else {
return Status(
Status::Code::UNSUPPORTED,
"gpu " + std::to_string(gpu_id) + " has compute capability '" +
std::to_string(cuprops.major) + "." +
std::to_string(cuprops.minor) +
"' which is less than the minimum supported of '" +
std::to_string(min_compute_capability) + "'");
}
}
Status
GetSupportedGPUs(
std::set<int>* supported_gpus, const double min_compute_capability)
{
// Make sure set is empty before starting
supported_gpus->clear();
int device_cnt;
cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
device_cnt = 0;
} else if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL, "unable to get number of CUDA devices: " +
std::string(cudaGetErrorString(cuerr)));
}
// populates supported_gpus
for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
if (status.IsOk()) {
supported_gpus->insert(gpu_id);
}
}
return Status::Success;
}
Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
// Query the device to check if integrated
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
// Zero-copy supported only on integrated GPU when it can map host memory
if (cuprops.integrated && cuprops.canMapHostMemory) {
*zero_copy_support = true;
} else {
*zero_copy_support = false;
}
return Status::Success;
}
#endif
}} // namespace triton::core