Commit 0a21fff9 authored by xiabo

Adapt to 0.1.0

parent 9484fd1c
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONBACKEND_*;
local: *;
};
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
namespace triton { namespace backend { namespace recommended {
//
// Backend that demonstrates the TRITONBACKEND API. This backend works
// for any model that has 1 input with any datatype and any shape and
// 1 output with the same shape and datatype as the input. The backend
// supports both batching and non-batching models.
//
// For each batch of requests, the backend returns the input tensor
// value in the output tensor.
//
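//
// As a concrete illustration (the values below mirror the example
// "recommended" model configuration included elsewhere in this commit;
// any other single-input/single-output model with matching datatype
// and shape would work just as well), a compatible batching model
// configuration might look like:
//
//   backend: "recommended"
//   max_batch_size: 8
//   input [ { name: "INPUT" data_type: TYPE_FP32 dims: [ 4, 4 ] } ]
//   output [ { name: "OUTPUT" data_type: TYPE_FP32 dims: [ 4, 4 ] } ]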
/////////////
extern "C" {
// Triton calls TRITONBACKEND_Initialize when a backend is loaded into
// Triton to allow the backend to create and initialize any state that
// is intended to be shared across all models and model instances that
// use the backend. The backend should also verify version
// compatibility with Triton in this function.
//
TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
std::string name(cname);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_Initialize: ") + name).c_str());
// Check the backend API version that Triton supports vs. what this
// backend was compiled against. Make sure that the Triton major
// version is the same and the minor version is >= what this backend
// uses.
uint32_t api_version_major, api_version_minor;
RETURN_IF_ERROR(
TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("Triton TRITONBACKEND API version: ") +
std::to_string(api_version_major) + "." +
std::to_string(api_version_minor))
.c_str());
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("'") + name + "' TRITONBACKEND API version: " +
std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
(api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"triton backend API version does not support this backend");
}
// The backend configuration may contain information needed by the
// backend, such as tritonserver command-line arguments. This
// backend doesn't use any such configuration, but for this example
// we print whatever is available.
TRITONSERVER_Message* backend_config_message;
RETURN_IF_ERROR(
TRITONBACKEND_BackendConfig(backend, &backend_config_message));
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(
backend_config_message, &buffer, &byte_size));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("backend configuration:\n") + buffer).c_str());
// This backend does not require any "global" state, but as an
// example we create a string to demonstrate.
std::string* state = new std::string("backend state");
RETURN_IF_ERROR(
TRITONBACKEND_BackendSetState(backend, reinterpret_cast<void*>(state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_Finalize when a backend is no longer
// needed.
//
TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
// Delete the "global" state associated with the backend.
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate));
std::string* state = reinterpret_cast<std::string*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_Finalize: state is '") + *state + "'")
.c_str());
delete state;
return nullptr; // success
}
} // extern "C"
/////////////
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model. ModelState is derived from the BackendModel
// class provided in the backend utilities, which provides many
// common functions.
//
class ModelState : public BackendModel {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
// Name of the input and output tensor
const std::string& InputTensorName() const { return input_name_; }
const std::string& OutputTensorName() const { return output_name_; }
// Datatype of the input and output tensor
TRITONSERVER_DataType TensorDataType() const { return datatype_; }
// Shape of the input and output tensor as given in the model
// configuration file. This shape will not include the batch
// dimension (if the model has one).
const std::vector<int64_t>& TensorNonBatchShape() const { return nb_shape_; }
// Shape of the input and output tensor, including the batch
// dimension (if the model has one). This method cannot be called
// until the model is completely loaded and initialized, including
// all instances of the model. In practice, this means that the backend
// should only call it in TRITONBACKEND_ModelInstanceExecute.
TRITONSERVER_Error* TensorShape(std::vector<int64_t>& shape);
// Validate that this model is supported by this backend.
TRITONSERVER_Error* ValidateModelConfig();
private:
ModelState(TRITONBACKEND_Model* triton_model);
std::string input_name_;
std::string output_name_;
TRITONSERVER_DataType datatype_;
bool shape_initialized_;
std::vector<int64_t> nb_shape_;
std::vector<int64_t> shape_;
};
ModelState::ModelState(TRITONBACKEND_Model* triton_model)
: BackendModel(triton_model), shape_initialized_(false)
{
// Validate that the model's configuration matches what is supported
// by this backend.
THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig());
}
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
TRITONSERVER_Error*
ModelState::TensorShape(std::vector<int64_t>& shape)
{
// This backend supports models that batch along the first dimension
// and those that don't batch. For non-batch models the output shape
// will be the shape from the model configuration. For batch models
// the output shape will be the shape from the model configuration
// prepended with [ -1 ] to represent the batch dimension. The
// backend "responder" utility used below will set the appropriate
// batch dimension value for each response. The shape needs to be
// initialized lazily because the SupportsFirstDimBatching function
// cannot be used until the model is completely loaded.
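// For example (illustrative values): with a configured non-batch
// shape of [ 4, 4 ], a batching model reports [ -1, 4, 4 ] here,
// while a non-batching model reports [ 4, 4 ].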
if (!shape_initialized_) {
bool supports_first_dim_batching;
RETURN_IF_ERROR(SupportsFirstDimBatching(&supports_first_dim_batching));
if (supports_first_dim_batching) {
shape_.push_back(-1);
}
shape_.insert(shape_.end(), nb_shape_.begin(), nb_shape_.end());
shape_initialized_ = true;
}
shape = shape_;
return nullptr; // success
}
TRITONSERVER_Error*
ModelState::ValidateModelConfig()
{
// If verbose logging is enabled, dump the model's configuration as
// JSON into the console output.
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer));
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("model configuration:\n") + buffer.Contents()).c_str());
}
// ModelConfig is the model configuration as a TritonJson
// object. Use the TritonJson utilities to parse the JSON and
// determine if the configuration is supported by this backend.
common::TritonJson::Value inputs, outputs;
RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &inputs));
RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &outputs));
// The model must have exactly 1 input and 1 output.
RETURN_ERROR_IF_FALSE(
inputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("model configuration must have 1 input"));
RETURN_ERROR_IF_FALSE(
outputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("model configuration must have 1 output"));
common::TritonJson::Value input, output;
RETURN_IF_ERROR(inputs.IndexAsObject(0, &input));
RETURN_IF_ERROR(outputs.IndexAsObject(0, &output));
// Record the input and output name in the model state.
const char* input_name;
size_t input_name_len;
RETURN_IF_ERROR(input.MemberAsString("name", &input_name, &input_name_len));
input_name_ = std::string(input_name);
const char* output_name;
size_t output_name_len;
RETURN_IF_ERROR(
output.MemberAsString("name", &output_name, &output_name_len));
output_name_ = std::string(output_name);
// Input and output must have the same datatype
std::string input_dtype, output_dtype;
RETURN_IF_ERROR(input.MemberAsString("data_type", &input_dtype));
RETURN_IF_ERROR(output.MemberAsString("data_type", &output_dtype));
RETURN_ERROR_IF_FALSE(
input_dtype == output_dtype, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected input and output datatype to match, got ") +
input_dtype + " and " + output_dtype);
datatype_ = ModelConfigDataTypeToTritonServerDataType(input_dtype);
// Input and output must have the same shape. Reshape is not supported
// on either input or output, so flag an error if the model
// configuration uses it.
triton::common::TritonJson::Value reshape;
RETURN_ERROR_IF_TRUE(
input.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
std::string("reshape not supported for input tensor"));
RETURN_ERROR_IF_TRUE(
output.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
std::string("reshape not supported for output tensor"));
std::vector<int64_t> input_shape, output_shape;
RETURN_IF_ERROR(backend::ParseShape(input, "dims", &input_shape));
RETURN_IF_ERROR(backend::ParseShape(output, "dims", &output_shape));
RETURN_ERROR_IF_FALSE(
input_shape == output_shape, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected input and output shape to match, got ") +
backend::ShapeToString(input_shape) + " and " +
backend::ShapeToString(output_shape));
nb_shape_ = input_shape;
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded
// to allow the backend to create any state associated with the model,
// and to also examine the model configuration to determine if the
// configuration is suitable for the backend. Any errors reported by
// this function will prevent the model from loading.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
// Create a ModelState object and associate it with the
// TRITONBACKEND_Model. If anything goes wrong with initialization
// of the model state then an error is returned and Triton will fail
// to load the model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer
// needed. The backend should cleanup any state associated with the
// model. This function will not be called until all model instances
// of the model have been finalized.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
delete model_state;
return nullptr; // success
}
} // extern "C"
/////////////
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each
// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from the
// BackendModelInstance class provided in the backend utilities, which
// provides many common functions.
//
class ModelInstanceState : public BackendModelInstance {
public:
static TRITONSERVER_Error* Create(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state);
virtual ~ModelInstanceState() = default;
// Get the state of the model that corresponds to this instance.
ModelState* StateForModel() const { return model_state_; }
private:
ModelInstanceState(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance),
model_state_(model_state)
{
}
ModelState* model_state_;
};
TRITONSERVER_Error*
ModelInstanceState::Create(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model
// instance is created to allow the backend to initialize any state
// associated with the instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
// Get the model state associated with this instance's model.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// Create a ModelInstanceState object and associate it with the
// TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model
// instance is no longer needed. The backend should cleanup any state
// associated with the model instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
delete instance_state;
return nullptr; // success
}
} // extern "C"
/////////////
extern "C" {
// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required
// that a backend create a response for each request in the batch. A
// response may be the output tensors required for that request or may
// be an error that is returned in the response.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Collect various timestamps during the execution of this batch of
// requests. These values are reported below before returning from
// the function.
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Best practice for a high-performance
// implementation is to avoid introducing mutex/lock and instead use
// only function-local and model-instance-specific state.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();
// 'responses' is initialized as a parallel array to 'requests',
// with one TRITONBACKEND_Response object for each
// TRITONBACKEND_Request object. If something goes wrong while
// creating these response objects, the backend simply returns an
// error from TRITONBACKEND_ModelInstanceExecute, indicating to
// Triton that this backend did not create or send any responses and
// so it is up to Triton to create and send an appropriate error
// response for each request. RETURN_IF_ERROR is one of several
// useful macros for error handling that can be found in
// backend_common.h.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
responses.push_back(response);
}
// At this point, the backend takes ownership of 'requests', which
// means that it is responsible for sending a response for every
// request. From here, even if something goes wrong in processing,
// the backend must return 'nullptr' from this function to indicate
// success. Any errors and failures must be communicated via the
// response objects.
//
// To simplify error handling, the backend utilities manage
// 'responses' in a specific way and it is recommended that backends
// follow this same pattern. When an error is detected in the
// processing of a request, an appropriate error response is sent
// and the corresponding TRITONBACKEND_Response object within
// 'responses' is set to nullptr to indicate that the
// request/response has already been handled and no further processing
// should be performed for that request. Even if all responses fail,
// the backend still allows execution to flow to the end of the
// function so that statistics are correctly reported by the calls
// to TRITONBACKEND_ModelInstanceReportStatistics and
// TRITONBACKEND_ModelInstanceReportBatchStatistics.
// RESPOND_AND_SET_NULL_IF_ERROR, and
// RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from
// backend_common.h that assist in this management of response
// objects.
// The backend could iterate over the 'requests' and process each
// one separately. But for performance reasons it is usually
// preferred to create batched input tensors that are processed
// simultaneously. This is especially true for devices like GPUs
// that are capable of exploiting the large amount of parallelism
// exposed by larger data sets.
//
// The backend utilities provide a "collector" to facilitate this
// batching process. The 'collector's ProcessTensor function will
// combine a tensor's value from each request in the batch into a
// single contiguous buffer. The buffer can be provided by the
// backend or 'collector' can create and manage it. In this backend,
// there is not a specific buffer into which the batch should be
// created, so use ProcessTensor arguments that cause the 'collector' to
// manage it. ProcessTensor does NOT support TRITONSERVER_TYPE_BYTES
// data type.
BackendInputCollector collector(
requests, request_count, &responses, model_state->TritonMemoryManager(),
false /* pinned_enabled */, nullptr /* stream*/);
// To instruct ProcessTensor to "gather" the entire batch of input
// tensors into a single contiguous buffer in CPU memory, set the
// "allowed input types" to be the CPU ones (see tritonserver.h in
// the triton-inference-server/core repo for allowed memory types).
std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =
{{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};
const char* input_buffer;
size_t input_buffer_byte_size;
TRITONSERVER_MemoryType input_buffer_memory_type;
int64_t input_buffer_memory_type_id;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
collector.ProcessTensor(
model_state->InputTensorName().c_str(), nullptr /* existing_buffer */,
0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer,
&input_buffer_byte_size, &input_buffer_memory_type,
&input_buffer_memory_type_id));
// Finalize the collector. If 'true' is returned, 'input_buffer'
// will not be valid until the backend synchronizes the CUDA
// stream or event that was used when creating the collector. For
// this backend, GPU is not supported, so no CUDA sync should be
// needed; if 'true' is returned, simply log an error.
const bool need_cuda_input_sync = collector.Finalize();
if (need_cuda_input_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'recommended' backend: unexpected CUDA sync required by collector");
}
// 'input_buffer' contains the batched input tensor. The backend can
// implement whatever logic is necessary to produce the output
// tensor. This backend simply logs the input tensor value and then
// returns the input tensor value in the output tensor so no actual
// computation is needed.
uint64_t compute_start_ns = 0;
SET_TIMESTAMP(compute_start_ns);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model ") + model_state->Name() + ": requests in batch " +
std::to_string(request_count))
.c_str());
std::string tstr;
IGNORE_ERROR(BufferAsTypedString(
tstr, input_buffer, input_buffer_byte_size,
model_state->TensorDataType()));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("batched " + model_state->InputTensorName() + " value: ") +
tstr)
.c_str());
const char* output_buffer = input_buffer;
TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;
int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;
uint64_t compute_end_ns = 0;
SET_TIMESTAMP(compute_end_ns);
bool supports_first_dim_batching;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
model_state->SupportsFirstDimBatching(&supports_first_dim_batching));
std::vector<int64_t> tensor_shape;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count, model_state->TensorShape(tensor_shape));
// Because the output tensor values are concatenated into a single
// contiguous 'output_buffer', the backend must "scatter" them out
// to the individual response output tensors. The backend utilities
// provide a "responder" to facilitate this scattering process.
// BackendOutputResponder does NOT support TRITONSERVER_TYPE_BYTES
// data type.
// The 'responder's ProcessTensor function will copy the portion of
// 'output_buffer' corresponding to each request's output into the
// response for that request.
BackendOutputResponder responder(
requests, request_count, &responses, model_state->TritonMemoryManager(),
supports_first_dim_batching, false /* pinned_enabled */,
nullptr /* stream*/);
responder.ProcessTensor(
model_state->OutputTensorName().c_str(), model_state->TensorDataType(),
tensor_shape, output_buffer, output_buffer_memory_type,
output_buffer_memory_type_id);
// Finalize the responder. If 'true' is returned, the output
// tensors' data will not be valid until the backend synchronizes
// the CUDA stream or event that was used when creating the
// responder. For this backend, GPU is not supported, so no CUDA
// sync should be needed; if 'true' is returned, simply log an
// error.
const bool need_cuda_output_sync = responder.Finalize();
if (need_cuda_output_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'recommended' backend: unexpected CUDA sync required by responder");
}
// Send all the responses that haven't already been sent because of
// an earlier error.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send response");
}
}
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
#ifdef TRITON_ENABLE_STATS
// For batch statistics we need to know the total batch size of the
// requests. This is not necessarily just the number of requests,
// because if the model supports batching then any request can be a
// batched request itself.
size_t total_batch_size = 0;
if (!supports_first_dim_batching) {
total_batch_size = request_count;
} else {
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
TRITONBACKEND_Input* input = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input),
"failed getting request input");
if (input != nullptr) {
const int64_t* shape = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr),
"failed getting input properties");
if (shape != nullptr) {
total_batch_size += shape[0];
}
}
}
}
#else
(void)exec_start_ns;
(void)exec_end_ns;
(void)compute_start_ns;
(void)compute_end_ns;
#endif // TRITON_ENABLE_STATS
// Report statistics for each request, and then release the request.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
#ifdef TRITON_ENABLE_STATS
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportStatistics(
instance_state->TritonModelInstance(), request,
(responses[r] != nullptr) /* success */, exec_start_ns,
compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting request statistics");
#endif // TRITON_ENABLE_STATS
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
#ifdef TRITON_ENABLE_STATS
// Report batch statistics.
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
instance_state->TritonModelInstance(), total_batch_size,
exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting batch request statistics");
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::recommended
#!/usr/bin/python
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import argparse
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-u',
'--url',
type=str,
required=False,
default='localhost:8000',
help='Inference server URL. Default is localhost:8000.')
FLAGS = parser.parse_args()
model_name = "bls_fp32"
shape = [16]
with httpclient.InferenceServerClient(url=FLAGS.url) as client:
input0_data = np.random.rand(*shape).astype(np.float32)
input1_data = np.random.rand(*shape).astype(np.float32)
inputs = [
httpclient.InferInput("INPUT0", input0_data.shape,
np_to_triton_dtype(input0_data.dtype)),
httpclient.InferInput("INPUT1", input1_data.shape,
np_to_triton_dtype(input1_data.dtype)),
]
inputs[0].set_data_from_numpy(input0_data)
inputs[1].set_data_from_numpy(input1_data)
outputs = [
httpclient.InferRequestedOutput("OUTPUT0"),
httpclient.InferRequestedOutput("OUTPUT1"),
]
response = client.infer(model_name,
inputs,
request_id=str(1),
outputs=outputs)
result = response.get_response()
output0_data = response.as_numpy("OUTPUT0")
output1_data = response.as_numpy("OUTPUT1")
print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format(
input0_data, input1_data, output0_data))
print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format(
input0_data, input1_data, output1_data))
if not np.allclose(input0_data + input1_data, output0_data):
print("error: incorrect sum")
sys.exit(1)
if not np.allclose(input0_data - input1_data, output1_data):
print("error: incorrect difference")
sys.exit(1)
print('\nPASS')
sys.exit(0)
#!/usr/bin/env python
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import argparse
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-u',
'--url',
type=str,
required=False,
default='localhost:8000',
help='Inference server URL. Default is localhost:8000.')
FLAGS = parser.parse_args()
# For the HTTP client, we need to specify a large enough concurrency to
# issue all the inference requests to the server in parallel. For
# this example we want to be able to send 2 requests concurrently.
try:
concurrent_request_count = 2
triton_client = httpclient.InferenceServerClient(
url=FLAGS.url, concurrency=concurrent_request_count)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit(1)
# First send a single request to the nonbatching model.
print('=========')
input0_data = np.array([ 1, 2, 3, 4 ], dtype=np.int32)
print('Sending request to nonbatching model: IN0 = {}'.format(input0_data))
inputs = [ httpclient.InferInput('IN0', [4], "INT32") ]
inputs[0].set_data_from_numpy(input0_data)
result = triton_client.infer('nonbatching', inputs)
print('Response: {}'.format(result.get_response()))
print('OUT0 = {}'.format(result.as_numpy('OUT0')))
# Send 2 requests to the batching model. Because these are sent
# asynchronously and Triton's dynamic batcher is configured to
# delay up to 5 seconds when forming a batch for this model, we
# expect these 2 requests to be batched within Triton and sent to
# the minimal backend as a single batch.
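# (For reference, the batching model's config.pbtxt included in this
# commit enables this behavior with "max_batch_size: 8" and
# "dynamic_batching { max_queue_delay_microseconds: 5000000 }".)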
print('\n=========')
async_requests = []
input0_data = np.array([[ 10, 11, 12, 13 ]], dtype=np.int32)
print('Sending request to batching model: IN0 = {}'.format(input0_data))
inputs = [ httpclient.InferInput('IN0', [1, 4], "INT32") ]
inputs[0].set_data_from_numpy(input0_data)
async_requests.append(triton_client.async_infer('batching', inputs))
input0_data = np.array([[ 20, 21, 22, 23 ]], dtype=np.int32)
print('Sending request to batching model: IN0 = {}'.format(input0_data))
inputs = [ httpclient.InferInput('IN0', [1, 4], "INT32") ]
inputs[0].set_data_from_numpy(input0_data)
async_requests.append(triton_client.async_infer('batching', inputs))
for async_request in async_requests:
# Get the result from the initiated asynchronous inference
# request. This call will block until the server responds.
result = async_request.get_result()
print('Response: {}'.format(result.get_response()))
print('OUT0 = {}'.format(result.as_numpy('OUT0')))
#!/usr/bin/env python
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import argparse
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-u',
'--url',
type=str,
required=False,
default='localhost:8000',
help='Inference server URL. Default is localhost:8000.')
FLAGS = parser.parse_args()
# For the HTTP client, we need to specify a large enough concurrency to
# issue all the inference requests to the server in parallel. For
# this example we want to be able to send 2 requests concurrently.
try:
concurrent_request_count = 2
triton_client = httpclient.InferenceServerClient(
url=FLAGS.url, concurrency=concurrent_request_count)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit(1)
# Send 2 requests to the batching model. Because these are sent
# asynchronously and Triton's dynamic batcher is configured to
# delay up to 5 seconds when forming a batch for this model, we
# expect these 2 requests to be batched within Triton and sent to
# the backend as a single batch.
#
# The recommended backend can handle any model with 1 input and 1
# output as long as the input and output datatype and shape are
# the same. The batching model uses datatype FP32 and shape
# [ 4, 4 ].
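# (For reference, the matching config.pbtxt in this commit declares a
# single input "INPUT" and a single output "OUTPUT", both TYPE_FP32
# with dims [ 4, 4 ], along with "max_batch_size: 8" and a 5-second
# dynamic batching delay.)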
print('\n=========')
async_requests = []
input0_data = np.array([[[ 1.0, 1.1, 1.2, 1.3 ],
[ 2.0, 2.1, 2.2, 2.3 ],
[ 3.0, 3.1, 3.2, 3.3 ],
[ 4.0, 4.1, 4.2, 4.3 ]]], dtype=np.float32)
print('Sending request to batching model: input = {}'.format(input0_data))
inputs = [ httpclient.InferInput('INPUT', [1, 4, 4], "FP32") ]
inputs[0].set_data_from_numpy(input0_data)
async_requests.append(triton_client.async_infer('batching', inputs))
input0_data = np.array([[[ 10.0, 10.1, 10.2, 10.3 ],
[ 20.0, 20.1, 20.2, 20.3 ],
[ 30.0, 30.1, 30.2, 30.3 ],
[ 40.0, 40.1, 40.2, 40.3 ]]], dtype=np.float32)
print('Sending request to batching model: input = {}'.format(input0_data))
inputs = [ httpclient.InferInput('INPUT', [1, 4, 4], "FP32") ]
inputs[0].set_data_from_numpy(input0_data)
async_requests.append(triton_client.async_infer('batching', inputs))
for async_request in async_requests:
# Get the result from the initiated asynchronous inference
# request. This call will block until the server responds.
result = async_request.get_result()
print('Response: {}'.format(result.get_response()))
print('OUTPUT = {}'.format(result.as_numpy('OUTPUT')))
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import json
import triton_python_backend_utils as pb_utils
# This model calculates the sum and difference of INPUT0 and INPUT1 and puts
# the results in OUTPUT0 and OUTPUT1, respectively. For more information
# regarding how this model.py was written, please refer to the Python Backend
# documentation.
class TritonPythonModel:
def initialize(self, args):
self.model_config = model_config = json.loads(args['model_config'])
output0_config = pb_utils.get_output_config_by_name(
model_config, "OUTPUT0")
output1_config = pb_utils.get_output_config_by_name(
model_config, "OUTPUT1")
self.output0_dtype = pb_utils.triton_string_to_numpy(
output0_config['data_type'])
self.output1_dtype = pb_utils.triton_string_to_numpy(
output1_config['data_type'])
def execute(self, requests):
output0_dtype = self.output0_dtype
output1_dtype = self.output1_dtype
responses = []
for request in requests:
in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")
out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(),
in_0.as_numpy() - in_1.as_numpy())
out_tensor_0 = pb_utils.Tensor("OUTPUT0",
out_0.astype(output0_dtype))
out_tensor_1 = pb_utils.Tensor("OUTPUT1",
out_1.astype(output1_dtype))
inference_response = pb_utils.InferenceResponse(
output_tensors=[out_tensor_0, out_tensor_1])
responses.append(inference_response)
return responses
def finalize(self):
print('Cleaning up...')
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "addsub_python"
backend: "python"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
input [
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
name: "addsub_tf"
platform: "tensorflow_savedmodel"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
},
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
},
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "bls_fp32"
backend: "bls"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
input [
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
backend: "minimal"
max_batch_size: 8
dynamic_batching {
max_queue_delay_microseconds: 5000000
}
input [
{
name: "IN0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
output [
{
name: "OUT0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
backend: "minimal"
max_batch_size: 0
input [
{
name: "IN0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
output [
{
name: "OUT0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
backend: "recommended"
max_batch_size: 8
dynamic_batching {
max_queue_delay_microseconds: 5000000
}
input [
{
name: "INPUT"
data_type: TYPE_FP32
dims: [ 4, 4 ]
}
]
output [
{
name: "OUTPUT"
data_type: TYPE_FP32
dims: [ 4, 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "triton/common/error.h"
#include "triton/core/tritonbackend.h"
#define TRITONJSON_STATUSTYPE TRITONSERVER_Error*
#define TRITONJSON_STATUSRETURN(M) \
return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str())
#define TRITONJSON_STATUSSUCCESS nullptr
#include "triton/common/triton_json.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#define IGNORE_ERROR(X) \
do { \
TRITONSERVER_Error* ie_err__ = (X); \
if (ie_err__ != nullptr) { \
TRITONSERVER_ErrorDelete(ie_err__); \
} \
} while (false)
#define LOG_IF_ERROR(X, MSG) \
do { \
TRITONSERVER_Error* lie_err__ = (X); \
if (lie_err__ != nullptr) { \
IGNORE_ERROR(TRITONSERVER_LogMessage( \
TRITONSERVER_LOG_INFO, __FILE__, __LINE__, \
(std::string(MSG) + ": " + TRITONSERVER_ErrorCodeString(lie_err__) + \
" - " + TRITONSERVER_ErrorMessage(lie_err__)) \
.c_str())); \
TRITONSERVER_ErrorDelete(lie_err__); \
} \
} while (false)
#define LOG_MESSAGE(LEVEL, MSG) \
do { \
LOG_IF_ERROR( \
TRITONSERVER_LogMessage(LEVEL, __FILE__, __LINE__, MSG), \
("failed to log message: ")); \
} while (false)
#define RETURN_ERROR_IF_FALSE(P, C, MSG) \
do { \
if (!(P)) { \
return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \
} \
} while (false)
#define RETURN_ERROR_IF_TRUE(P, C, MSG) \
do { \
if ((P)) { \
return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \
} \
} while (false)
#define RETURN_IF_ERROR(X) \
do { \
TRITONSERVER_Error* rie_err__ = (X); \
if (rie_err__ != nullptr) { \
return rie_err__; \
} \
} while (false)
#ifdef TRITON_ENABLE_GPU
#define LOG_IF_CUDA_ERROR(X, MSG) \
do { \
cudaError_t lice_err__ = (X); \
if (lice_err__ != cudaSuccess) { \
IGNORE_ERROR(TRITONSERVER_LogMessage( \
TRITONSERVER_LOG_INFO, __FILE__, __LINE__, \
(std::string(MSG) + ": " + cudaGetErrorString(lice_err__)) \
.c_str())); \
} \
} while (false)
#define RETURN_IF_CUDA_ERROR(X, C, MSG) \
do { \
cudaError_t rice_err__ = (X); \
if (rice_err__ != cudaSuccess) { \
return TRITONSERVER_ErrorNew( \
C, ((MSG) + ": " + cudaGetErrorString(rice_err__)).c_str()); \
} \
} while (false)
#endif // TRITON_ENABLE_GPU
#define RESPOND_AND_SET_NULL_IF_ERROR(RESPONSE_PTR, X) \
do { \
TRITONSERVER_Error* rarie_err__ = (X); \
if (rarie_err__ != nullptr) { \
if (*RESPONSE_PTR != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
*RESPONSE_PTR, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
rarie_err__), \
"failed to send error response"); \
*RESPONSE_PTR = nullptr; \
} \
TRITONSERVER_ErrorDelete(rarie_err__); \
} \
} while (false)
#define RESPOND_ALL_AND_SET_NULL_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
do { \
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \
if (RESPONSES[ridx] != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
raasnie_err__), \
"failed to send error response"); \
RESPONSES[ridx] = nullptr; \
} \
} \
TRITONSERVER_ErrorDelete(raasnie_err__); \
} \
} while (false)
#define RESPOND_ALL_AND_SET_TRUE_IF_ERROR(RESPONSES, RESPONSES_COUNT, BOOL, X) \
do { \
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
BOOL = true; \
for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \
if (RESPONSES[ridx] != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
raasnie_err__), \
"failed to send error response"); \
RESPONSES[ridx] = nullptr; \
} \
} \
TRITONSERVER_ErrorDelete(raasnie_err__); \
} \
} while (false)
#ifdef TRITON_ENABLE_STATS
#define TIMESPEC_TO_NANOS(TS) ((TS).tv_sec * 1000000000 + (TS).tv_nsec)
#define SET_TIMESTAMP(TS_NS) \
{ \
TS_NS = std::chrono::duration_cast<std::chrono::nanoseconds>( \
std::chrono::steady_clock::now().time_since_epoch()) \
.count(); \
}
#define DECL_TIMESTAMP(TS_NS) \
uint64_t TS_NS; \
SET_TIMESTAMP(TS_NS);
#else
#define DECL_TIMESTAMP(TS_NS)
#define SET_TIMESTAMP(TS_NS)
#endif // TRITON_ENABLE_STATS
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
/// Convenience deleter for TRITONBACKEND_ResponseFactory.
struct ResponseFactoryDeleter {
void operator()(TRITONBACKEND_ResponseFactory* f)
{
LOG_IF_ERROR(
TRITONBACKEND_ResponseFactoryDelete(f),
"failed deleting response factory");
}
};
// A representation of the BatchInput message in model config
class BatchInput {
public:
enum class Kind {
BATCH_ELEMENT_COUNT,
BATCH_ACCUMULATED_ELEMENT_COUNT,
BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO,
BATCH_MAX_ELEMENT_COUNT_AS_SHAPE,
BATCH_ITEM_SHAPE,
BATCH_ITEM_SHAPE_FLATTEN
};
static TRITONSERVER_Error* ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchInput>* batch_inputs);
const std::vector<std::string>& TargetNames() const { return target_names_; }
TRITONSERVER_DataType DataType() const { return data_type_; }
Kind BatchInputKind() const { return kind_; }
std::string BatchInputKindString() const { return kind_str_; }
const std::vector<std::string>& SourceInputs() const
{
return source_inputs_;
}
private:
TRITONSERVER_Error* Init(triton::common::TritonJson::Value& bi_config);
Kind kind_;
std::string kind_str_;
std::vector<std::string> target_names_;
TRITONSERVER_DataType data_type_;
std::vector<std::string> source_inputs_;
};
// A representation of the BatchOutput message in model config
class BatchOutput {
public:
enum class Kind { BATCH_SCATTER_WITH_INPUT_SHAPE };
static TRITONSERVER_Error* ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchOutput>* batch_outputs);
const std::vector<std::string>& TargetNames() const { return target_names_; }
TRITONSERVER_DataType DataType() const { return data_type_; }
const std::vector<int64_t>& OutputShape() const { return shape_; }
Kind BatchOutputKind() const { return kind_; }
const std::vector<std::string>& SourceInputs() const
{
return source_inputs_;
}
private:
Kind kind_;
std::vector<std::string> target_names_;
TRITONSERVER_DataType data_type_;
std::vector<int64_t> shape_;
std::vector<std::string> source_inputs_;
};
struct CopyParams {
CopyParams(void* dst, const void* src, const size_t byte_size)
: dst_(dst), src_(src), byte_size_(byte_size)
{
}
void* dst_;
const void* src_;
const size_t byte_size_;
};
/// The value for a dimension in a shape that indicates that that
/// dimension can take on any size.
constexpr int WILDCARD_DIM = -1;
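// For example (illustrative): a model input declared with
// dims [ -1, 4 ] accepts any size in its first dimension.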
constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
"auto_mixed_precision";
TRITONSERVER_MemoryType GetUsePinnedMemoryType(
TRITONSERVER_MemoryType ref_buffer_type);
TRITONSERVER_Error* CommonErrorToTritonError(triton::common::Error error);
TRITONSERVER_Error_Code StatusCodeToTritonCode(
triton::common::Error::Code error_code);
/// Parse an array in a JSON object into the corresponding shape. The
/// array must be composed of integers.
///
/// \param io The JSON object containing the member array.
/// \param name The name of the array member in the JSON object.
/// \param shape Returns the shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseShape(
common::TritonJson::Value& io, const std::string& name,
std::vector<int64_t>* shape);
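// For example (illustrative): given an 'io' JSON object containing
// "dims": [ 4, 4 ], ParseShape(io, "dims", &shape) fills 'shape'
// with { 4, 4 }.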
/// Return the string representation of a shape.
///
/// \param dims The shape dimensions.
/// \param dims_count The number of dimensions.
/// \return The string representation.
std::string ShapeToString(const int64_t* dims, const size_t dims_count);
/// Return the string representation of a shape.
///
/// \param shape The shape as a vector of dimensions.
/// \return The string representation.
std::string ShapeToString(const std::vector<int64_t>& shape);
/// Return the number of elements of a shape.
///
/// \param dims The shape dimensions.
/// \param dims_count The number of dimensions.
/// \return The number of elements.
int64_t GetElementCount(const int64_t* dims, const size_t dims_count);
/// Return the number of elements of a shape.
///
/// \param shape The shape as a vector of dimensions.
/// \return The number of elements.
int64_t GetElementCount(const std::vector<int64_t>& shape);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims);
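/// Illustrative sketch (not part of the original header): combining
/// GetElementCount and GetByteSize for a dense FP32 tensor. The shape values
/// are hypothetical.
inline int64_t ExampleTensorByteSize()
{
  const std::vector<int64_t> shape{8, 3, 224, 224};
  const int64_t element_count = GetElementCount(shape);  // 8 * 3 * 224 * 224
  (void)element_count;
  // GetByteSize multiplies the element count by the 4-byte size of FP32 and
  // returns -1 if the size cannot be determined.
  return GetByteSize(TRITONSERVER_TYPE_FP32, shape);
}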
/// Get an input tensor's contents into a buffer. This overload expects
/// both 'buffer' and the input's buffers to be in CPU memory.
///
/// \param request The inference request.
/// \param input_name The name of the input buffer.
/// \param buffer The buffer where the input tensor content is copied into.
/// \param buffer_byte_size Acts as both input and output. On input
/// gives the size of 'buffer', in bytes. The function will fail if
/// the buffer is not large enough to hold the input tensor
/// contents. Returns the size of the input tensor data returned in
/// 'buffer'.
/// \param host_policy_name The host policy name to look up the input buffer.
/// Default input buffer will be used if nullptr is provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, const char* host_policy_name = nullptr);
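/// Illustrative sketch (not part of the original header): copying a small
/// CPU-resident input tensor into a fixed stack buffer. The input name
/// "INPUT0" and the buffer size are hypothetical.
inline TRITONSERVER_Error* ExampleReadCpuInput(TRITONBACKEND_Request* request)
{
  char buffer[512];
  size_t buffer_byte_size = sizeof(buffer);  // in: capacity, out: bytes read
  TRITONSERVER_Error* err =
      ReadInputTensor(request, "INPUT0", buffer, &buffer_byte_size);
  if (err != nullptr) {
    return err;  // e.g. input not found or 'buffer' too small
  }
  // 'buffer_byte_size' now holds the actual byte size of the tensor data.
  return nullptr;  // success
}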
/// Get an input tensor's contents into a buffer. This overload of
/// 'ReadInputTensor' supports input buffers that can be in any memory.
///
/// \param request The inference request.
/// \param input_name The name of the input buffer.
/// \param buffer The buffer where the input tensor content is copied into.
/// \param buffer_byte_size Acts as both input and output. On input
/// gives the size of 'buffer', in bytes. The function will fail if
/// the buffer is not large enough to hold the input tensor
/// contents. Returns the size of the input tensor data returned in
/// 'buffer'.
/// \param host_policy_name The host policy name to look up the input buffer.
/// Default input buffer will be used if nullptr is provided.
/// \param memory_type The memory type of the buffer provided.
/// \param memory_type_id The memory type id of the buffer provided.
/// \param cuda_stream The stream the copy is associated with; 0 can be
/// passed for the default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,
const char* host_policy_name = nullptr, const bool copy_on_stream = false);
/// Validate that an input matches one of the allowed input names.
/// \param io The model input.
/// \param allowed The set of allowed input names.
/// \return The error status. A non-OK status indicates the input
/// is not valid.
TRITONSERVER_Error* CheckAllowedModelInput(
common::TritonJson::Value& io, const std::set<std::string>& allowed);
/// Validate that an output matches one of the allowed output names.
/// \param io The model output.
/// \param allowed The set of allowed output names.
/// \return The error status. A non-OK status indicates the output
/// is not valid.
TRITONSERVER_Error* CheckAllowedModelOutput(
common::TritonJson::Value& io, const std::set<std::string>& allowed);
/// Get the tensor name, false value, and true value for a boolean
/// sequence batcher control kind. If 'required' is true then a tensor
/// for the control must be found. If 'required' is false, 'tensor_name'
/// is returned as an empty string if the control is not mapped to any
/// tensor.
///
/// \param batcher The JSON object of the sequence batcher.
/// \param model_name The name of the model.
/// \param control_kind The kind of control tensor to look for.
/// \param required Whether the tensor must be specified.
/// \param tensor_name Returns the name of the tensor.
/// \param tensor_datatype Returns the data type of the tensor.
/// \param fp32_false_value Returns the float value for false if
/// the tensor type is FP32.
/// \param fp32_true_value Returns the float value for true if
/// the tensor type is FP32.
/// \param int32_false_value Returns the int value for false if
/// the tensor type is INT32.
/// \param int32_true_value Returns the int value for true if
/// the tensor type is INT32.
/// \param bool_false_value Returns the bool value for false if
/// the tensor type is BOOL.
/// \param bool_true_value Returns the bool value for true if
/// the tensor type is BOOL.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetBooleanSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype,
float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value,
int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value);
/// Get the tensor name and datatype for a non-boolean sequence
/// batcher control kind. If 'required' is true then a tensor for the
/// control must be found. If 'required' is false, 'tensor_name' is
/// returned as an empty string if the control is not mapped to any
/// tensor. 'tensor_datatype' returns the required datatype for the
/// control.
///
/// \param batcher The JSON object of the sequence batcher.
/// \param model_name The name of the model.
/// \param control_kind The kind of control tensor to look for.
/// \param required Whether the tensor must be specified.
/// \param tensor_name Returns the name of the tensor.
/// \param tensor_datatype Returns the data type of the tensor.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetTypedSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype);
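/// Illustrative sketch (not part of the original header): looking up the
/// optional CORRID control tensor of a sequence batcher. The control-kind
/// string follows the usual model-config naming but is an assumption here.
inline TRITONSERVER_Error* ExampleGetCorrIdControl(
    common::TritonJson::Value& batcher, const std::string& model_name,
    std::string* tensor_name, std::string* tensor_datatype)
{
  // With 'required' == false an unmapped control leaves 'tensor_name' empty
  // instead of returning an error.
  return GetTypedSequenceControlProperties(
      batcher, model_name, "CONTROL_SEQUENCE_CORRID", false /* required */,
      tensor_name, tensor_datatype);
}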
/// Create and send an error response for a set of requests. This
/// function takes ownership of 'response_err' and so the caller must
/// not access or delete it after this call returns.
///
/// \param requests The requests.
/// \param request_count The number of 'requests'.
/// \param response_err The error to send to each request.
/// \param release_request If true, the requests will be released after
/// sending the error responses and the request pointers are set to
/// nullptr.
void RequestsRespondWithError(
TRITONBACKEND_Request** requests, const uint32_t request_count,
TRITONSERVER_Error* response_err, const bool release_request = true);
/// Send an error response for a set of responses. This function takes
/// ownership of 'response_err' and so the caller must not access or
/// delete it after this call returns.
///
/// \param responses The responses.
/// \param response_count The number of 'responses'.
/// \param response_err The error to send.
void SendErrorForResponses(
std::vector<TRITONBACKEND_Response*>* responses,
const uint32_t response_count, TRITONSERVER_Error* response_err);
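/// Illustrative sketch (not part of the original header): aborting an entire
/// batch by sending the same error to every request. Ownership of the error
/// passes to RequestsRespondWithError, so it must not be deleted afterwards.
inline void ExampleAbortBatch(
    TRITONBACKEND_Request** requests, const uint32_t request_count)
{
  TRITONSERVER_Error* err = TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL, "unable to execute batch");
  // With the default 'release_request' == true the requests are also
  // released and must not be accessed again by the caller.
  RequestsRespondWithError(requests, request_count, err);
}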
/// Copy a buffer from 'src' to 'dst' for the given 'byte_size'. The buffer
/// locations are identified by memory type and id, and the appropriate kind
/// of copy will be initiated.
/// \param msg The message to be prepended in error message.
/// \param src_memory_type The memory type of the source buffer.
/// \param src_memory_type_id The memory type id of the source buffer.
/// \param dst_memory_type The memory type of the destination buffer.
/// \param dst_memory_type_id The memory type id of the destination buffer.
/// \param byte_size The byte size of the source buffer.
/// \param src The pointer to the source buffer.
/// \param dst The pointer to the destination buffer.
/// \param cuda_stream The stream the copy is associated with; 0 can be
/// passed for the default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used,
const bool copy_on_stream = false);
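/// Illustrative sketch (not part of the original header): a CPU-to-CPU copy
/// through CopyBuffer. No CUDA stream is needed in this case, so 'cuda_used'
/// is expected to come back false.
inline TRITONSERVER_Error* ExampleCpuCopy(
    const void* src, void* dst, const size_t byte_size)
{
  bool cuda_used = false;
  return CopyBuffer(
      "example copy", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
      TRITONSERVER_MEMORY_CPU, 0 /* dst_memory_type_id */, byte_size, src, dst,
      nullptr /* cuda_stream */, &cuda_used);
}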
/// Does a file or directory exist?
/// \param path The path to check for existence.
/// \param exists Returns true if file/dir exists
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* FileExists(const std::string& path, bool* exists);
/// Read a text file into a string.
/// \param path The path of the file.
/// \param contents Returns the contents of the file.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadTextFile(
const std::string& path, std::string* contents);
/// Is a path a directory?
/// \param path The path to check.
/// \param is_dir Returns true if path represents a directory
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* IsDirectory(const std::string& path, bool* is_dir);
/// Join path segments into a longer path
/// \param segments The path segments.
/// \return the path formed by joining the segments.
std::string JoinPath(std::initializer_list<std::string> segments);
/// Returns the content in the model version path and the path to the content
/// as key-value pairs.
/// \param model_repository_path The path to the model repository.
/// \param version The version of the model.
/// \param ignore_directories Whether the directories will be ignored.
/// \param ignore_files Whether the files will be ignored.
/// \param model_paths Returns the content in the model version path and
/// the path to the content.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ModelPaths(
const std::string& model_repository_path, uint64_t version,
const bool ignore_directories, const bool ignore_files,
std::unordered_map<std::string, std::string>* model_paths);
/// Create a CUDA stream appropriate for GPU<->CPU data transfer
/// operations for a given GPU device. The caller takes ownership of
/// the stream. 'stream' returns nullptr if GPU support is disabled.
///
/// \param device_id The ID of the GPU.
/// \param cuda_stream_priority The stream priority. Use 0 for normal priority.
/// \param stream Returns the created stream.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* CreateCudaStream(
const int device_id, const int cuda_stream_priority, cudaStream_t* stream);
/// Parse the string as long long integer.
///
/// \param value The string.
/// \param parsed_value The long long integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseLongLongValue(
const std::string& value, int64_t* parsed_value);
/// Parse the string as unsigned long long integer.
///
/// \param value The string.
/// \param parsed_value The unsigned long long integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseUnsignedLongLongValue(
const std::string& value, uint64_t* parsed_value);
/// Parse the string as boolean.
///
/// \param value The string.
/// \param parsed_value The boolean value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseBoolValue(
const std::string& value, bool* parsed_value);
/// Parse the string as integer.
///
/// \param value The string.
/// \param parsed_value The integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseIntValue(const std::string& value, int* parsed_value);
/// Parse the string as double.
///
/// \param value The string.
/// \param parsed_value The double value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseDoubleValue(
const std::string& value, double* parsed_value);
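/// Illustrative sketch (not part of the original header): the parsing helpers
/// report failures through the returned error rather than throwing, so bad
/// configuration strings can be surfaced cleanly. The inputs are hypothetical.
inline TRITONSERVER_Error* ExampleParseValues()
{
  int64_t count = 0;
  TRITONSERVER_Error* err = ParseLongLongValue("128", &count);
  if (err != nullptr) {
    return err;
  }
  bool enabled = false;
  return ParseBoolValue("true", &enabled);
}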
/// Return the value of the specified key in a JSON object.
///
/// \param params The JSON object containing the key-value mapping.
/// \param key The key to look up the value in the JSON object.
/// \param value Returns the value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetParameterValue(
triton::common::TritonJson::Value& params, const std::string& key,
std::string* value);
/// Return the Triton server data type of the data type string specified
/// in model config JSON.
///
/// \param data_type_str The string representation of the data type.
/// \return the Triton server data type.
TRITONSERVER_DataType ModelConfigDataTypeToTritonServerDataType(
const std::string& data_type_str);
/// Try to parse the requested parameter.
///
/// \param params The 'parameters' JSON object in the model config.
/// \param mkey Key in the model config.
/// \param value The parsed string value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
std::string* value, const std::string& default_value);
/// Try to parse the requested parameter.
///
/// \param params The 'parameters' JSON object in the model config.
/// \param mkey Key in the model config.
/// \param value The parsed int value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
int* value, const int& default_value);
/// Try to parse the requested parameter.
///
/// \param params The 'parameters' JSON object in the model config.
/// \param mkey Key in the model config.
/// \param value The parsed bool value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
bool* value, const bool& default_value);
/// Try to parse the requested parameter.
///
/// \param params The 'parameters' JSON object in the model config.
/// \param mkey Key in the model config.
/// \param value The parsed uint64 value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
uint64_t* value, const uint64_t& default_value);
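/// Illustrative sketch (not part of the original header): reading optional
/// values out of a model config's "parameters" section, falling back to
/// defaults when a key is absent. The keys "mode" and "num_threads" are
/// hypothetical, and TritonJson::Value::Find is assumed to behave as in the
/// triton common library.
inline TRITONSERVER_Error* ExampleReadParameters(
    triton::common::TritonJson::Value& model_config)
{
  triton::common::TritonJson::Value params;
  if (model_config.Find("parameters", &params)) {
    std::string mode;
    TRITONSERVER_Error* err =
        TryParseModelStringParameter(params, "mode", &mode, "default");
    if (err != nullptr) {
      return err;
    }
    int num_threads = 0;
    return TryParseModelStringParameter(params, "num_threads", &num_threads, 1);
  }
  return nullptr;  // no "parameters" section; keep the defaults
}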
/// Get a string representation of a tensor buffer.
///
/// \param str Returns the string.
/// \param buffer The base pointer to the tensor buffer.
/// \param buffer_byte_size The size of the buffer in bytes.
/// \param datatype The type of the tensor
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* BufferAsTypedString(
std::string& str, const char* buffer, size_t buffer_byte_size,
TRITONSERVER_DataType datatype);
/// Get the ID of the request as a string formatted for logging.
///
/// \param request Request of which to get the ID.
/// \return a formatted string for logging the request ID.
std::string GetRequestId(TRITONBACKEND_Request* request);
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <memory>
#include <string>
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_memory.h"
#include "triton/common/async_work_queue.h"
#include "triton/common/sync_queue.h"
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
using cudaEvent_t = void*;
#endif // !TRITON_ENABLE_GPU
//
// BackendInputCollector
//
class BackendInputCollector {
public:
// The caller can optionally provide 'event' for internal synchronization
// instead of using 'stream'. If 'host_policy_name' is provided, it must be
// valid for the lifetime of the collector
explicit BackendInputCollector(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
cudaStream_t stream, cudaEvent_t event = nullptr,
cudaEvent_t buffer_ready_event = nullptr,
const size_t kernel_buffer_threshold = 0,
const char* host_policy_name = nullptr, const bool copy_on_stream = false,
const bool coalesce_request_input = false)
: need_sync_(false), requests_(requests), request_count_(request_count),
responses_(responses), memory_manager_(memory_manager),
pinned_enabled_(pinned_enabled),
use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
stream_(stream), event_(event), buffer_ready_event_(buffer_ready_event),
kernel_buffer_threshold_(kernel_buffer_threshold),
pending_pinned_byte_size_(0), pending_pinned_offset_(0),
pending_copy_kernel_buffer_byte_size_(0),
pending_copy_kernel_buffer_offset_(0),
pending_copy_kernel_input_buffer_counts_(0), async_task_count_(0),
host_policy_cstr_(host_policy_name), copy_on_stream_(copy_on_stream),
coalesce_request_input_(coalesce_request_input)
{
}
~BackendInputCollector() = default;
// Process all requests for a named input tensor and return the
// concatenated values of those requests in a single contiguous
// buffer. This overload of the function can avoid data copy if the
// tensor values are already contiguous and the caller doesn't
// provide a destination 'buffer'.
//
// 'buffer' is used to determine whether the input should be placed at the
// 'buffer' provided by the caller. If 'buffer' == nullptr, the returned
// buffer will be managed by the BackendInputCollector object and
// has the same lifecycle as the BackendInputCollector object.
// 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.
// 'allowed_input_types' is the ordered list of the memory type and id pairs
// that the returned buffer can be. It must only contain the memory type
// and id of 'buffer' if 'buffer' is not nullptr.
// 'dst_buffer' returns the contiguous buffer of the input tensor.
// 'dst_buffer_byte_size' the byte size of 'dst_buffer'.
// 'dst_memory_type' returns the memory type of 'dst_buffer'.
// 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.
TRITONSERVER_Error* ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);
// Process all requests for a named input tensor and return the
// concatenated values of those requests in a single contiguous
// 'buffer'.
//
// 'buffer' The buffer to hold the concatenated tensor values. Must
// be large enough to hold all of the tensor values.
// 'buffer_byte_size' is the byte size of 'buffer'.
// 'dst_memory_type' The memory type of 'buffer'.
// 'dst_memory_type_id' The memory type id of 'buffer'.
void ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
// Process the batch input and return its shape. Returning an error indicates
// that the batch input can't be formed properly and the caller should abort
// the whole batch.
TRITONSERVER_Error* BatchInputShape(
const BatchInput& batch_input, std::vector<int64_t>* shape);
// Process the batch input and derive its value into 'buffer'. Returning an
// error indicates that the batch input can't be formed properly and
// the caller should abort the whole batch.
// 'buffer' is used to determine whether the input should be placed at the
// 'buffer' provided by the caller. If 'buffer' == nullptr, the returned
// buffer will be managed by the BackendInputCollector object and
// has the same lifecycle as the BackendInputCollector object.
// 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.
// 'allowed_input_types' is the ordered list of the memory type and id pairs
// that the returned buffer can be. It must only contain the memory type
// and id of 'buffer' if it is not nullptr.
// 'dst_buffer' returns the contiguous buffer of the input tensor.
// 'dst_memory_type' returns the memory type of 'dst_buffer'.
// 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.
TRITONSERVER_Error* ProcessBatchInput(
const BatchInput& batch_input, char* buffer,
const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);
// Finalize processing of all requests for all input tensors. Return
// true if cudaMemcpyAsync is called, and the caller should call
// cudaStreamSynchronize (or cudaEventSynchronize on 'event') before
// using the data.
bool Finalize();
private:
struct ContiguousBuffer {
ContiguousBuffer() : start_request_idx_(0), end_request_idx_(0) {}
MemoryDesc memory_desc_;
size_t start_request_idx_;
size_t end_request_idx_;
};
class InputIterator {
public:
InputIterator(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,
const char* host_policy_name, const bool coalesce_request_input);
// Return false if the iterator has reached the end of the inputs; in that
// case 'input' is not set.
bool GetNextContiguousInput(ContiguousBuffer* input);
private:
TRITONBACKEND_Request** requests_;
const uint32_t request_count_;
std::vector<TRITONBACKEND_Response*>* responses_;
const char* input_name_;
const char* host_policy_;
const bool coalesce_request_input_;
TRITONBACKEND_Input* curr_input_;
size_t curr_request_idx_;
size_t curr_buffer_idx_;
uint32_t curr_buffer_cnt_;
bool reach_end_;
};
// Return whether the entire input is in a contiguous buffer. If this returns
// true, the properties of the contiguous input buffer are also returned.
// Otherwise, only 'buffer_byte_size' is set, to the total byte size of the
// input.
bool GetInputBufferIfContiguous(
const char* input_name, const char** buffer, size_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
bool FlushPendingPinned(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id);
bool FlushPendingCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id);
TRITONSERVER_Error* LaunchCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id);
bool SetInputTensor(
const char* input_name, const ContiguousBuffer& input,
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
const TRITONSERVER_MemoryType use_pinned_memory_type,
const bool use_kernel, const bool wait_buffer);
template <typename T>
TRITONSERVER_Error* SetElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size);
template <typename T>
TRITONSERVER_Error* SetAccumulatedElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size);
template <typename T>
TRITONSERVER_Error* SetBatchItemShape(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size);
bool need_sync_;
TRITONBACKEND_Request** requests_;
const uint32_t request_count_;
std::vector<TRITONBACKEND_Response*>* responses_;
TRITONBACKEND_MemoryManager* memory_manager_;
const bool pinned_enabled_;
const bool use_async_cpu_copy_;
cudaStream_t stream_;
cudaEvent_t event_;
cudaEvent_t buffer_ready_event_;
const size_t kernel_buffer_threshold_;
size_t pending_pinned_byte_size_;
size_t pending_pinned_offset_;
std::list<ContiguousBuffer> pending_pinned_input_buffers_;
// Managed memory blocks that must remain alive for the lifetime of this
// BackendInputCollector object.
std::list<std::unique_ptr<BackendMemory>> in_use_memories_;
size_t pending_copy_kernel_buffer_byte_size_;
size_t pending_copy_kernel_buffer_offset_;
size_t pending_copy_kernel_input_buffer_counts_;
std::list<ContiguousBuffer> pending_copy_kernel_input_buffers_;
std::vector<std::unique_ptr<std::vector<int8_t*>>> input_ptr_buffer_host_;
std::vector<std::unique_ptr<std::vector<size_t>>> byte_size_buffer_host_;
std::vector<std::unique_ptr<std::vector<size_t>>>
byte_size_offset_buffer_host_;
// Pinned memory buffers and the corresponding request_inputs where
// the final copy to the tensor is deferred until Finalize() after
// waiting for all in-flight copies.
struct DeferredPinned {
DeferredPinned(
char* pinned_memory, const size_t pinned_memory_size,
char* tensor_buffer, const size_t tensor_buffer_offset,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_id,
std::list<ContiguousBuffer>&& request_buffers,
std::vector<TRITONBACKEND_Response*>* responses)
: finalized_(false), pinned_memory_(pinned_memory),
pinned_memory_size_(pinned_memory_size),
tensor_buffer_(tensor_buffer),
tensor_buffer_offset_(tensor_buffer_offset),
tensor_memory_type_(tensor_memory_type),
tensor_memory_id_(tensor_memory_id),
requests_(std::move(request_buffers)), responses_(responses)
{
}
bool Finalize(cudaStream_t stream);
bool finalized_;
// Holds a reference to the pinned memory buffer, which is managed
// by the BackendInputCollector as 'pinned_memory'.
char* pinned_memory_;
const size_t pinned_memory_size_;
char* tensor_buffer_;
const size_t tensor_buffer_offset_;
const TRITONSERVER_MemoryType tensor_memory_type_;
const int64_t tensor_memory_id_;
std::list<ContiguousBuffer> requests_;
std::vector<TRITONBACKEND_Response*>* responses_;
};
std::list<DeferredPinned> deferred_pinned_;
// FIXME use future to maintain an issue-order queue to drop task count
triton::common::SyncQueue<bool> completion_queue_;
size_t async_task_count_;
const char* host_policy_cstr_;
const bool copy_on_stream_;
const bool coalesce_request_input_;
};
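//
// Illustrative usage sketch (not part of the original header): gathering a
// named input across a batch of requests into a pre-allocated CPU buffer and
// noting whether a stream synchronization is still required. The input name
// "INPUT0" and the buffer management are hypothetical.
//
inline void ExampleCollectInput(
    TRITONBACKEND_Request** requests, const uint32_t request_count,
    std::vector<TRITONBACKEND_Response*>* responses,
    TRITONBACKEND_MemoryManager* memory_manager, char* buffer,
    const size_t buffer_byte_size, cudaStream_t stream)
{
  BackendInputCollector collector(
      requests, request_count, responses, memory_manager,
      true /* pinned_enabled */, stream);
  // Concatenate the "INPUT0" values of all requests into 'buffer'.
  collector.ProcessTensor(
      "INPUT0", buffer, buffer_byte_size, TRITONSERVER_MEMORY_CPU,
      0 /* memory_type_id */);
  // Finalize() returns true if cudaMemcpyAsync was issued; the caller should
  // then synchronize on 'stream' (or the provided event) before using the
  // data, e.g. with cudaStreamSynchronize when GPU support is enabled.
  const bool need_sync = collector.Finalize();
  (void)need_sync;
}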
}} // namespace triton::backend
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include <vector>
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend {
// Collection of common properties that describe a buffer in Triton
struct MemoryDesc {
MemoryDesc()
: buffer_(nullptr), byte_size_(0), memory_type_(TRITONSERVER_MEMORY_CPU),
memory_type_id_(0)
{
}
MemoryDesc(
const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
: buffer_(buffer), byte_size_(byte_size), memory_type_(memory_type),
memory_type_id_(memory_type_id)
{
}
const char* buffer_;
size_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t memory_type_id_;
};
//
// BackendMemory
//
// Utility class for allocating and deallocating memory using both
// TRITONBACKEND_MemoryManager and direct GPU and CPU malloc/free.
//
class BackendMemory {
public:
enum class AllocationType { CPU, CPU_PINNED, GPU, CPU_PINNED_POOL, GPU_POOL };
// Allocate a contiguous block of 'alloc_type' memory. 'mem'
// returns the pointer to the allocated memory.
//
// CPU, CPU_PINNED_POOL and GPU_POOL are allocated using
// TRITONBACKEND_MemoryManagerAllocate. Note that CPU_PINNED and GPU
// allocations can be much slower than the POOL variants.
//
// Two error codes have specific interpretations for this function:
//
// TRITONSERVER_ERROR_UNSUPPORTED: Indicates that the function is
// incapable of allocating the requested memory type and memory
// type ID. Requests for that memory type and ID will always fail,
// regardless of the 'byte_size' of the request.
//
// TRITONSERVER_ERROR_UNAVAILABLE: Indicates that the function can
// allocate the memory type and ID but that currently it cannot
// allocate a contiguous block of memory of the requested
// 'byte_size'.
static TRITONSERVER_Error* Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, const size_t byte_size,
BackendMemory** mem);
// Allocate a contiguous block of memory by attempting the
// allocation using 'alloc_types' in order until one is successful.
// See BackendMemory::Create() above for details.
static TRITONSERVER_Error* Create(
TRITONBACKEND_MemoryManager* manager,
const std::vector<AllocationType>& alloc_types,
const int64_t memory_type_id, const size_t byte_size,
BackendMemory** mem);
// Creates a BackendMemory object from a pre-allocated buffer. The buffer
// is not owned by the object created with this function. Hence, for
// proper operation, the lifetime of the buffer must extend at least until
// the corresponding BackendMemory object is destroyed.
static TRITONSERVER_Error* Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, void* buffer, const size_t byte_size,
BackendMemory** mem);
~BackendMemory();
AllocationType AllocType() const { return alloctype_; }
int64_t MemoryTypeId() const { return memtype_id_; }
char* MemoryPtr() { return buffer_; }
size_t ByteSize() const { return byte_size_; }
TRITONSERVER_MemoryType MemoryType() const
{
return AllocTypeToMemoryType(alloctype_);
}
static TRITONSERVER_MemoryType AllocTypeToMemoryType(const AllocationType a);
static const char* AllocTypeString(const AllocationType a);
private:
BackendMemory(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloctype,
const int64_t memtype_id, char* buffer, const size_t byte_size,
const bool owns_buffer = true)
: manager_(manager), alloctype_(alloctype), memtype_id_(memtype_id),
buffer_(buffer), byte_size_(byte_size), owns_buffer_(owns_buffer)
{
}
TRITONBACKEND_MemoryManager* manager_;
AllocationType alloctype_;
int64_t memtype_id_;
char* buffer_;
size_t byte_size_;
bool owns_buffer_;
};
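//
// Illustrative usage sketch (not part of the original header): trying the
// pinned-memory pool first and falling back to plain CPU memory when the
// pool cannot satisfy the request. The caller owns the returned object, and
// the allocation is expected to be released in its destructor.
//
inline TRITONSERVER_Error* ExampleAllocate(
    TRITONBACKEND_MemoryManager* manager, const size_t byte_size,
    BackendMemory** memory)
{
  return BackendMemory::Create(
      manager,
      {BackendMemory::AllocationType::CPU_PINNED_POOL,
       BackendMemory::AllocationType::CPU},
      0 /* memory_type_id */, byte_size, memory);
}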
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <map>
#include <set>
#include <string>
#include "triton/backend/backend_common.h"
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend {
//
// BackendModel
//
// Common functionality for a backend model. This class is provided as
// a convenience; backends are not required to use this class.
//
class BackendModel {
public:
BackendModel(
TRITONBACKEND_Model* triton_model, const bool allow_optional = false);
virtual ~BackendModel() = default;
// Get the handle to the TRITONSERVER server hosting this model.
TRITONSERVER_Server* TritonServer() { return triton_server_; }
// Get the handle to the memory manager for this model.
TRITONBACKEND_MemoryManager* TritonMemoryManager()
{
return triton_memory_manager_;
}
// Get the handle to the TRITONBACKEND model.
TRITONBACKEND_Model* TritonModel() { return triton_model_; }
// Get the name and version of the model.
const std::string& Name() const { return name_; }
uint64_t Version() const { return version_; }
const std::string& RepositoryPath() const { return repository_path_; }
// The model configuration.
common::TritonJson::Value& ModelConfig() { return model_config_; }
// Sends the updated model configuration back to the Triton core.
TRITONSERVER_Error* SetModelConfig();
// Parses information out of the model configuration.
TRITONSERVER_Error* ParseModelConfig();
// Maximum batch size supported by the model. A value of 0
// indicates that the model does not support batching.
int MaxBatchSize() const { return max_batch_size_; }
// Set the max batch size for the model. When a backend
// auto-completes a configuration it may set or change the maximum
// batch size.
void SetMaxBatchSize(const int b) { max_batch_size_ = b; }
// Does this model support batching in the first dimension?
TRITONSERVER_Error* SupportsFirstDimBatching(bool* supports);
// Whether to use an indirect pinned memory buffer when copying an input
// or output tensor to/from the model.
bool EnablePinnedInput() const { return enable_pinned_input_; }
bool EnablePinnedOutput() const { return enable_pinned_output_; }
const std::vector<BatchInput>& BatchInputs() const { return batch_inputs_; }
const std::vector<BatchOutput>& BatchOutputs() const
{
return batch_outputs_;
}
const BatchOutput* FindBatchOutput(const std::string& output_name) const;
bool IsInputRagged(const std::string& input_name) const
{
return (ragged_inputs_.find(input_name) != ragged_inputs_.end());
}
bool IsInputOptional(const std::string& input_name) const
{
return (optional_inputs_.find(input_name) != optional_inputs_.end());
}
protected:
TRITONSERVER_Server* triton_server_;
TRITONBACKEND_MemoryManager* triton_memory_manager_;
TRITONBACKEND_Model* triton_model_;
std::string name_;
uint64_t version_;
std::string repository_path_;
bool allow_optional_;
common::TritonJson::Value model_config_;
int max_batch_size_;
bool enable_pinned_input_;
bool enable_pinned_output_;
std::vector<BatchInput> batch_inputs_;
std::vector<BatchOutput> batch_outputs_;
std::map<std::string, const BatchOutput*> batch_output_map_;
std::set<std::string> ragged_inputs_;
std::set<std::string> optional_inputs_;
};
//
// BackendModelException
//
// Exception thrown if an error occurs while constructing a
// BackendModel.
//
struct BackendModelException {
BackendModelException(TRITONSERVER_Error* err) : err_(err) {}
TRITONSERVER_Error* err_;
};
#define THROW_IF_BACKEND_MODEL_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw triton::backend::BackendModelException(tie_err__); \
} \
} while (false)
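//
// Illustrative usage sketch (not part of the original header): a derived
// model state whose constructor converts configuration errors into a
// BackendModelException via the macro above; the factory catches the
// exception and returns the wrapped error. The class and function names are
// hypothetical.
//
class ExampleModelState : public BackendModel {
 public:
  explicit ExampleModelState(TRITONBACKEND_Model* triton_model)
      : BackendModel(triton_model)
  {
    bool supports_first_dim_batching = false;
    THROW_IF_BACKEND_MODEL_ERROR(
        SupportsFirstDimBatching(&supports_first_dim_batching));
  }
};
inline TRITONSERVER_Error* ExampleCreateModelState(
    TRITONBACKEND_Model* triton_model, ExampleModelState** state)
{
  try {
    *state = new ExampleModelState(triton_model);
  }
  catch (const BackendModelException& ex) {
    return ex.err_;
  }
  return nullptr;  // success
}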
}} // namespace triton::backend