Commit 0a21fff9 authored by xiabo

Adapt to 0.1.0

parent 9484fd1c
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
class BackendModel;
//
// BackendModelInstance
//
// Common functionality for a backend model instance. This class is
// provided as a convenience; backends are not required to use this
// class.
//
class BackendModelInstance {
public:
BackendModelInstance(
BackendModel* backend_model,
TRITONBACKEND_ModelInstance* triton_model_instance);
virtual ~BackendModelInstance();
// Get the name, kind and device ID of the instance.
const std::string& Name() const { return name_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
// Get the handle to the TRITONBACKEND model instance.
TRITONBACKEND_ModelInstance* TritonModelInstance()
{
return triton_model_instance_;
}
// Get the BackendModel representing the model that corresponds to
// this instance.
BackendModel* Model() const { return backend_model_; }
// The model configuration 'default_model_filename' value, or the
// value in model configuration 'cc_model_filenames' for the GPU
// targeted by this instance. If neither is specified in the model
// configuration, an empty string is returned.
const std::string& ArtifactFilename() const { return artifact_filename_; }
// Returns the stream associated with this instance that can be used
// for GPU<->CPU memory transfers. Returns nullptr if GPU support is
// disabled or if this instance is not executing on a GPU.
cudaStream_t CudaStream() { return stream_; }
const std::string& HostPolicyName() const { return host_policy_name_; }
protected:
BackendModel* backend_model_;
TRITONBACKEND_ModelInstance* triton_model_instance_;
std::string name_;
TRITONSERVER_InstanceGroupKind kind_;
int32_t device_id_;
std::string artifact_filename_;
cudaStream_t stream_;
std::string host_policy_name_;
};
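//
// Example (illustrative sketch, not part of this header): backends typically
// derive their per-instance state from this class. 'ModelState' below is a
// hypothetical type derived from BackendModel.
//
//   class ModelInstanceState : public BackendModelInstance {
//    public:
//     ModelInstanceState(
//         ModelState* model_state,
//         TRITONBACKEND_ModelInstance* triton_model_instance)
//         : BackendModelInstance(model_state, triton_model_instance)
//     {
//     }
//   };
//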
//
// BackendModelInstanceException
//
// Exception thrown if error occurs while constructing an
// BackendModelInstance.
//
struct BackendModelInstanceException {
BackendModelInstanceException(TRITONSERVER_Error* err) : err_(err) {}
TRITONSERVER_Error* err_;
};
#define THROW_IF_BACKEND_INSTANCE_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw triton::backend::BackendModelInstanceException(tie_err__); \
} \
} while (false)
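//
// Example usage of the macro above (illustrative sketch): construction errors
// thrown as BackendModelInstanceException are converted back to a
// TRITONSERVER_Error at the TRITONBACKEND API boundary. 'SomeCallThatMayFail'
// is a hypothetical function returning TRITONSERVER_Error*.
//
//   try {
//     THROW_IF_BACKEND_INSTANCE_ERROR(SomeCallThatMayFail());
//   }
//   catch (const BackendModelInstanceException& ex) {
//     return ex.err_;  // hand the error back to Triton
//   }
//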
}} // namespace triton::backend
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <string>
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/common/async_work_queue.h"
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
using cudaEvent_t = void*;
#endif // !TRITON_ENABLE_GPU
//
// BackendOutputResponder
//
class BackendOutputResponder {
public:
// The caller can optionally provide 'event' for internal synchronization
// instead of using 'stream'.
explicit BackendOutputResponder(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
TRITONBACKEND_MemoryManager* memory_manager,
const bool first_dim_batching, const bool pinned_enabled,
cudaStream_t stream, cudaEvent_t event = nullptr,
bool copy_on_stream = false)
: need_sync_(false), requests_(requests), request_count_(request_count),
responses_(responses), memory_manager_(memory_manager),
first_dim_batching_(first_dim_batching),
pinned_enabled_(pinned_enabled),
use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
stream_(stream), event_(event), pending_pinned_byte_size_(0),
copy_on_stream_(copy_on_stream)
{
}
// Legacy constructor for backwards compatibility. The above
// constructor should be used for all new cases. The responder needs
// to know if the model is batching along the first dimension. With
// this constructor we derive that information from the
// max_batch_size value instead of having it provided directly as in
// the above constructor.
explicit BackendOutputResponder(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size,
TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
cudaStream_t stream, cudaEvent_t event = nullptr,
bool copy_on_stream = false)
: need_sync_(false), requests_(requests), request_count_(request_count),
responses_(responses), memory_manager_(memory_manager),
first_dim_batching_(max_batch_size >= 1),
pinned_enabled_(pinned_enabled),
use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
stream_(stream), event_(event), pending_pinned_byte_size_(0),
copy_on_stream_(copy_on_stream)
{
}
~BackendOutputResponder();
// Process all responses for a named output tensor.
// 'batchn_shape' may be modified by the call.
void ProcessTensor(
const std::string& name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
// Process all responses for a named state tensor. Returns a vector of
// TRITONBACKEND_State objects that the backend can use to update the state.
// If TRITONBACKEND_StateUpdate is not called on the vector elements, the
// state will not be updated.
// 'batchn_shape' may be modified by the call.
std::vector<TRITONBACKEND_State*> ProcessStateTensor(
const std::string& name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
// Process all responses for a batch output and derive its value from
// 'buffer'.
void ProcessBatchOutput(
const std::string& name, const BatchOutput& batch_output,
const char* buffer, const TRITONSERVER_MemoryType memory_type,
const int64_t memory_type_id);
// Finalize processing of all responses for all output
// tensors. Return true if cudaMemcpyAsync is called, and the caller
// should call cudaStreamSynchronize (or cudaEventSynchronize on 'event')
// before using the data.
bool Finalize();
private:
bool FlushPendingPinned(
const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id);
bool SetFixedSizeBuffer(
TRITONBACKEND_Response** response, void* response_state_or_output,
const std::string& output_name, const size_t tensor_byte_size,
const size_t tensor_offset, const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id,
const TRITONSERVER_MemoryType use_pinned_memory_type, bool state);
struct OutputData {
OutputData(
const std::string& name, void* buffer, const size_t buffer_byte_size,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
: name_(name), buffer_(buffer), buffer_byte_size_(buffer_byte_size),
memory_type_(memory_type), memory_type_id_(memory_type_id)
{
}
const std::string name_;
void* buffer_;
const size_t buffer_byte_size_;
const TRITONSERVER_MemoryType memory_type_;
const int64_t memory_type_id_;
};
bool need_sync_;
TRITONBACKEND_Request** requests_;
const uint32_t request_count_;
std::vector<TRITONBACKEND_Response*>* responses_;
TRITONBACKEND_MemoryManager* memory_manager_;
const bool first_dim_batching_;
const bool pinned_enabled_;
const bool use_async_cpu_copy_;
cudaStream_t stream_;
cudaEvent_t event_;
using ResponsesList =
std::list<std::pair<TRITONBACKEND_Response**, OutputData>>;
size_t pending_pinned_byte_size_;
size_t pending_pinned_offset_;
ResponsesList pending_pinned_outputs_;
const bool copy_on_stream_;
// Pinned memories that need to live over the lifetime of this
// BackendOutputResponder object.
std::list<char*> pinned_memories_;
// Pinned memory buffers and the corresponding response outputs
// where the final copy to the response is deferred until Finalize()
// after waiting for all in-flight copies.
struct DeferredPinned {
DeferredPinned(
char* pinned_memory, const size_t pinned_memory_size,
ResponsesList&& responses)
: pinned_memory_(pinned_memory),
pinned_memory_size_(pinned_memory_size),
responses_(std::move(responses))
{
}
char* pinned_memory_;
const size_t pinned_memory_size_;
ResponsesList responses_;
};
std::list<DeferredPinned> deferred_pinned_;
};
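//
// Example usage (illustrative sketch; 'requests', 'responses', 'stream',
// 'batchn_shape', 'output_buffer' and the output name are hypothetical): a
// backend typically creates one responder per execution, calls ProcessTensor()
// for each output, and synchronizes only if Finalize() reports an in-flight
// CUDA copy.
//
//   BackendOutputResponder responder(
//       requests, request_count, &responses, memory_manager,
//       true /* first_dim_batching */, true /* pinned_enabled */, stream);
//   responder.ProcessTensor(
//       "OUTPUT0", TRITONSERVER_TYPE_FP32, batchn_shape, output_buffer,
//       TRITONSERVER_MEMORY_GPU, 0 /* memory_type_id */);
//   if (responder.Finalize()) {
//     cudaStreamSynchronize(stream);
//   }
//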
}} // namespace triton::backend
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#ifdef _WIN32
// suppress the min and max definitions in Windef.h.
#define NOMINMAX
#include <Windows.h>
// Define _CRT_INTERNAL_NONSTDC_NAMES 1 before including the Microsoft-provided
// C Runtime library headers to expose declarations without the "_" prefix, to
// match POSIX style.
#define _CRT_INTERNAL_NONSTDC_NAMES 1
#include <direct.h>
#include <io.h>
#else
#include <dirent.h>
#include <unistd.h>
#endif
#include <sys/stat.h>
#include <algorithm>
#include <cerrno>
#include <fstream>
#include <functional>
#include <memory>
#ifdef _WIN32
// <sys/stat.h> in Windows doesn't define S_ISDIR macro
#if !defined(S_ISDIR) && defined(S_IFMT) && defined(S_IFDIR)
#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
#endif
#define F_OK 0
#endif
namespace triton { namespace backend {
#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
auto* copy_params = reinterpret_cast<CopyParams*>(args);
memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
delete copy_params;
}
#endif // TRITON_ENABLE_GPU
TRITONSERVER_MemoryType
GetUsePinnedMemoryType(TRITONSERVER_MemoryType ref_buffer_type)
{
// The following matrix is used for both input and output.
// src \ dest | non-pinned | pinned | device
// non-pinned | memcpy | memcpy | buffer needed
// pinned | memcpy | memcpy | cudaMemcpy
// device | buffer needed | cudaMemcpy | cudaMemcpy
if (ref_buffer_type == TRITONSERVER_MEMORY_CPU_PINNED) {
return TRITONSERVER_MEMORY_CPU_PINNED;
}
return (ref_buffer_type == TRITONSERVER_MEMORY_CPU) ? TRITONSERVER_MEMORY_GPU
: TRITONSERVER_MEMORY_CPU;
}
TRITONSERVER_Error_Code
StatusCodeToTritonCode(triton::common::Error::Code error_code)
{
switch (error_code) {
case triton::common::Error::Code::UNKNOWN:
return TRITONSERVER_ERROR_UNKNOWN;
case triton::common::Error::Code::INTERNAL:
return TRITONSERVER_ERROR_INTERNAL;
case triton::common::Error::Code::NOT_FOUND:
return TRITONSERVER_ERROR_NOT_FOUND;
case triton::common::Error::Code::INVALID_ARG:
return TRITONSERVER_ERROR_INVALID_ARG;
case triton::common::Error::Code::UNAVAILABLE:
return TRITONSERVER_ERROR_UNAVAILABLE;
case triton::common::Error::Code::UNSUPPORTED:
return TRITONSERVER_ERROR_UNSUPPORTED;
case triton::common::Error::Code::ALREADY_EXISTS:
return TRITONSERVER_ERROR_ALREADY_EXISTS;
default:
break;
}
return TRITONSERVER_ERROR_UNKNOWN;
}
TRITONSERVER_Error*
CommonErrorToTritonError(triton::common::Error error)
{
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(error.ErrorCode()), error.Message().c_str());
}
TRITONSERVER_Error*
ParseShape(
common::TritonJson::Value& io, const std::string& name,
std::vector<int64_t>* shape)
{
common::TritonJson::Value shape_array;
RETURN_IF_ERROR(io.MemberAsArray(name.c_str(), &shape_array));
for (size_t i = 0; i < shape_array.ArraySize(); ++i) {
int64_t d = 0;
RETURN_IF_ERROR(shape_array.IndexAsInt(i, &d));
shape->push_back(d);
}
return nullptr; // success
}
std::string
ShapeToString(const int64_t* dims, const size_t dims_count)
{
bool first = true;
std::string str("[");
for (size_t i = 0; i < dims_count; ++i) {
const int64_t dim = dims[i];
if (!first) {
str += ",";
}
str += std::to_string(dim);
first = false;
}
str += "]";
return str;
}
std::string
ShapeToString(const std::vector<int64_t>& shape)
{
return ShapeToString(shape.data(), shape.size());
}
int64_t
GetElementCount(const int64_t* dims, const size_t dims_count)
{
bool first = true;
int64_t cnt = 0;
for (size_t i = 0; i < dims_count; i++) {
if (dims[i] == WILDCARD_DIM) {
return -1;
}
if (first) {
cnt = dims[i];
first = false;
} else {
cnt *= dims[i];
}
}
return cnt;
}
int64_t
GetElementCount(const std::vector<int64_t>& shape)
{
return GetElementCount(shape.data(), shape.size());
}
int64_t
GetByteSize(
const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims)
{
size_t dt_size = TRITONSERVER_DataTypeByteSize(dtype);
if (dt_size == 0) {
return -1;
}
int64_t cnt = GetElementCount(dims);
if (cnt == -1) {
return -1;
}
return cnt * dt_size;
}
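//
// Example (worked case, illustrative): for TRITONSERVER_TYPE_FP32 and dims
// {2, 3} the element count is 6, so GetByteSize() returns 6 * 4 = 24. If any
// dimension is WILDCARD_DIM the size is unknown and -1 is returned.
//
//   const std::vector<int64_t> dims{2, 3};
//   const int64_t byte_size = GetByteSize(TRITONSERVER_TYPE_FP32, dims);  // 24
//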
TRITONSERVER_Error*
ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,
const char* host_policy_name, const bool copy_on_stream)
{
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(
TRITONBACKEND_RequestInput(request, input_name.c_str(), &input));
uint64_t input_byte_size;
uint32_t input_buffer_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_name, nullptr, nullptr, nullptr, nullptr,
&input_byte_size, &input_buffer_count));
RETURN_ERROR_IF_FALSE(
input_byte_size <= *buffer_byte_size, TRITONSERVER_ERROR_INVALID_ARG,
std::string(
GetRequestId(request) + "buffer too small for input tensor '" +
input_name + "', " + std::to_string(*buffer_byte_size) + " < " +
std::to_string(input_byte_size)));
size_t output_buffer_offset = 0;
for (uint32_t b = 0; b < input_buffer_count; ++b) {
const void* input_buffer = nullptr;
uint64_t input_buffer_byte_size = 0;
TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t input_memory_type_id = 0;
RETURN_IF_ERROR(TRITONBACKEND_InputBufferForHostPolicy(
input, host_policy_name, b, &input_buffer, &input_buffer_byte_size,
&input_memory_type, &input_memory_type_id));
RETURN_IF_ERROR(CopyBuffer(
"Failed to copy buffer", input_memory_type, input_memory_type_id,
memory_type, memory_type_id, input_buffer_byte_size, input_buffer,
buffer + output_buffer_offset, cuda_stream, cuda_used, copy_on_stream));
output_buffer_offset += input_buffer_byte_size;
}
*buffer_byte_size = input_byte_size;
return nullptr; // success
}
TRITONSERVER_Error*
ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, const char* host_policy_name)
{
bool cuda_used;
return ReadInputTensor(
request, input_name, buffer, buffer_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */,
nullptr /* cuda_stream */, &cuda_used, host_policy_name);
}
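//
// Example usage (illustrative sketch; sizing the buffer is the caller's
// responsibility and 'expected_byte_size' is hypothetical): read a whole input
// into a caller-owned CPU buffer.
//
//   std::vector<char> input(expected_byte_size);
//   size_t actual_byte_size = input.size();
//   RETURN_IF_ERROR(ReadInputTensor(
//       request, "INPUT0", input.data(), &actual_byte_size,
//       nullptr /* host_policy_name */));
//   // On return, 'actual_byte_size' holds the number of bytes actually read.
//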
TRITONSERVER_Error*
CheckAllowedModelInput(
common::TritonJson::Value& io, const std::set<std::string>& allowed)
{
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (allowed.find(io_name) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected inference input '" + io_name +
"', allowed inputs are: " + astr)
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
CheckAllowedModelOutput(
common::TritonJson::Value& io, const std::set<std::string>& allowed)
{
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (allowed.find(io_name) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected inference output '" + io_name +
"', allowed outputs are: " + astr)
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
GetBooleanSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype,
float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value,
int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
common::TritonJson::Value control_inputs;
if (batcher.Find("control_input", &control_inputs)) {
for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
common::TritonJson::Value control_input;
RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
std::string input_name;
RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
if (input_name.empty()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must have a name for ") +
model_name)
.c_str());
}
if (seen_tensors.find(input_name) != seen_tensors.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("sequence batching control tensor '") + input_name +
"' is specified for multiple control kinds for " + model_name)
.c_str());
}
seen_tensors.insert(input_name);
common::TritonJson::Value controls;
if (control_input.Find("control", &controls)) {
for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
common::TritonJson::Value c;
RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
std::string kind_str;
RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
if (kind_str == control_kind) {
if (seen_control) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies multiple " + control_kind +
" tensors for " + model_name)
.c_str()));
}
*tensor_name = input_name;
seen_control = true;
common::TritonJson::Value int32_false_true, fp32_false_true,
bool_false_true;
bool found_int32 =
(c.Find("int32_false_true", &int32_false_true) &&
(int32_false_true.ArraySize() > 0));
bool found_fp32 =
(c.Find("fp32_false_true", &fp32_false_true) &&
(fp32_false_true.ArraySize() > 0));
bool found_bool =
(c.Find("bool_false_true", &bool_false_true) &&
(bool_false_true.ArraySize() > 0));
// Make sure only one of int, float, or bool type is specified.
if (!(found_int32 || found_fp32 || found_bool)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching must specify either "
"'int32_false_true', 'fp32_false_true' or "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
} else if (
(found_fp32 && found_int32) || (found_fp32 && found_bool) ||
(found_int32 && found_bool)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies more than one from "
"'int32_false_true', 'fp32_false_true' and "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
}
if (found_int32) {
if (int32_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'int32_false_true' must "
"have "
"exactly 2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_INT32";
}
if (int32_false_value != nullptr) {
int64_t value;
RETURN_IF_ERROR(int32_false_true.IndexAsInt(0, &value));
*int32_false_value = value;
}
if (int32_true_value != nullptr) {
int64_t value;
RETURN_IF_ERROR(int32_false_true.IndexAsInt(1, &value));
*int32_true_value = value;
}
} else if (found_fp32) {
if (fp32_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'fp32_false_true' must "
"have exactly "
"2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_FP32";
}
if (fp32_false_value != nullptr) {
double value = 0.0;
RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(0, &value));
*fp32_false_value = value;
}
if (fp32_true_value != nullptr) {
double value = 0.0;
RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(1, &value));
*fp32_true_value = value;
}
} else {
if (bool_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'bool_false_true' must "
"have exactly "
"2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_BOOL";
}
if (bool_false_value != nullptr) {
bool value;
RETURN_IF_ERROR(bool_false_true.IndexAsBool(0, &value));
*bool_false_value = value;
}
if (bool_true_value != nullptr) {
bool value;
RETURN_IF_ERROR(bool_false_true.IndexAsBool(1, &value));
*bool_true_value = value;
}
}
}
}
}
}
}
if (!seen_control) {
if (required) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must specify a " +
control_kind + " value for " + model_name))
.c_str());
}
tensor_name->clear();
}
return nullptr; // success
}
TRITONSERVER_Error*
GetTypedSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
common::TritonJson::Value control_inputs;
if (batcher.Find("control_input", &control_inputs)) {
for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
common::TritonJson::Value control_input;
RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
std::string input_name;
RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
if (input_name.empty()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must have a name for ") +
model_name)
.c_str());
}
if (seen_tensors.find(input_name) != seen_tensors.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("sequence batching control tensor '") + input_name +
"' is specified for multiple control kinds for " + model_name)
.c_str());
}
seen_tensors.insert(input_name);
common::TritonJson::Value controls;
if (control_input.Find("control", &controls)) {
for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
common::TritonJson::Value c;
RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
std::string kind_str;
RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
if (kind_str == control_kind) {
if (seen_control) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies multiple " + control_kind +
" tensors for " + model_name)
.c_str()));
}
*tensor_name = input_name;
if (tensor_datatype != nullptr) {
RETURN_IF_ERROR(c.MemberAsString("data_type", tensor_datatype));
}
seen_control = true;
common::TritonJson::Value int32_false_true, fp32_false_true,
bool_false_true;
bool found_int32 =
(c.Find("int32_false_true", &int32_false_true) &&
(int32_false_true.ArraySize() > 0));
bool found_fp32 =
(c.Find("fp32_false_true", &fp32_false_true) &&
(fp32_false_true.ArraySize() > 0));
bool found_bool =
(c.Find("bool_false_true", &bool_false_true) &&
(bool_false_true.ArraySize() > 0));
if (found_fp32 || found_int32 || found_bool) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching must not specify either "
"'int32_false_true', 'fp32_false_true' or "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
}
}
}
}
}
}
if (!seen_control) {
if (required) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must specify a " +
control_kind + " value for " + model_name))
.c_str());
}
tensor_name->clear();
}
return nullptr; // success
}
void
RequestsRespondWithError(
TRITONBACKEND_Request** requests, const uint32_t request_count,
TRITONSERVER_Error* response_err, const bool release_request)
{
for (size_t i = 0; i < request_count; i++) {
TRITONBACKEND_Response* response;
auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(GetRequestId(requests[i]) + "fail to create response").c_str());
TRITONSERVER_ErrorDelete(err);
} else {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
(GetRequestId(requests[i]) + "fail to send error response").c_str());
}
if (release_request) {
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(
requests[i], TRITONSERVER_REQUEST_RELEASE_ALL),
"fail to release request");
requests[i] = nullptr;
}
}
TRITONSERVER_ErrorDelete(response_err);
}
void
SendErrorForResponses(
std::vector<TRITONBACKEND_Response*>* responses,
const uint32_t response_count, TRITONSERVER_Error* response_err)
{
for (size_t i = 0; i < response_count; i++) {
TRITONBACKEND_Response* response = (*responses)[i];
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
"fail to send error response");
(*responses)[i] = nullptr;
}
}
TRITONSERVER_ErrorDelete(response_err);
}
TRITONSERVER_Error*
CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used,
const bool copy_on_stream)
{
*cuda_used = false;
if (byte_size > 0) {
if (src == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
msg + ": attempted a copy of " + std::to_string(byte_size) +
" Bytes from an uninitialized memory")
.c_str());
}
if (dst == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
msg + ": attempted a copy of " + std::to_string(byte_size) +
" Bytes to an uninitialized memory")
.c_str());
}
}
// For host-to-host copies: when 'copy_on_stream' is false the copy blocks the
// host, so memcpy() is used directly. When 'copy_on_stream' is true the copy
// is enqueued on 'cuda_stream' via cudaLaunchHostFunc(), so the caller must
// ensure the src buffer stays valid until the stream executes the copy.
if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
(dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
if (copy_on_stream) {
auto params = new CopyParams(dst, src, byte_size);
cudaLaunchHostFunc(
cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
*cuda_used = true;
} else {
memcpy(dst, src, byte_size);
}
#else
memcpy(dst, src, byte_size);
#endif // TRITON_ENABLE_GPU
} else {
#ifdef TRITON_ENABLE_GPU
// [TODO] use cudaMemcpyDefault if UVM is supported for the device
auto copy_kind = cudaMemcpyDeviceToDevice;
if (src_memory_type != TRITONSERVER_MEMORY_GPU) {
copy_kind = cudaMemcpyHostToDevice;
} else if (dst_memory_type != TRITONSERVER_MEMORY_GPU) {
copy_kind = cudaMemcpyDeviceToHost;
}
if ((src_memory_type_id != dst_memory_type_id) &&
(copy_kind == cudaMemcpyDeviceToDevice)) {
RETURN_IF_CUDA_ERROR(
cudaMemcpyPeerAsync(
dst, dst_memory_type_id, src, src_memory_type_id, byte_size,
cuda_stream),
TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
} else {
RETURN_IF_CUDA_ERROR(
cudaMemcpyAsync(dst, src, byte_size, copy_kind, cuda_stream),
TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
}
*cuda_used = true;
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(msg + ": try to use CUDA copy while GPU is not supported")
.c_str());
#endif // TRITON_ENABLE_GPU
}
return nullptr; // success
}
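//
// Example usage (illustrative sketch): copy a device buffer to host memory and
// synchronize only if an asynchronous CUDA copy was actually issued.
//
//   bool cuda_used = false;
//   RETURN_IF_ERROR(CopyBuffer(
//       "output copy", TRITONSERVER_MEMORY_GPU, 0 /* src id */,
//       TRITONSERVER_MEMORY_CPU, 0 /* dst id */, byte_size, src, dst, stream,
//       &cuda_used));
//   if (cuda_used) {
//     cudaStreamSynchronize(stream);
//   }
//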
TRITONSERVER_Error*
GetDirectoryContents(const std::string& path, std::set<std::string>* contents)
{
#ifdef _WIN32
WIN32_FIND_DATAA entry;
// Append "/*" so the search enumerates the directory contents rather than
// matching the directory itself.
const std::string wildcard_path = path + "/*";
HANDLE dir = FindFirstFileA(wildcard_path.c_str(), &entry);
if (dir == INVALID_HANDLE_VALUE) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to open directory: ") + path).c_str());
}
do {
const std::string entryname = entry.cFileName;
if ((entryname != ".") && (entryname != "..")) {
contents->insert(entryname);
}
} while (FindNextFileA(dir, &entry));
FindClose(dir);
#else
DIR* dir = opendir(path.c_str());
if (dir == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to open directory: ") + path).c_str());
}
struct dirent* entry;
while ((entry = readdir(dir)) != nullptr) {
std::string entryname = entry->d_name;
if ((entryname != ".") && (entryname != "..")) {
contents->insert(entryname);
}
}
closedir(dir);
#endif
return nullptr; // success
}
TRITONSERVER_Error*
FileExists(const std::string& path, bool* exists)
{
*exists = (access(path.c_str(), F_OK) == 0);
return nullptr; // success
}
TRITONSERVER_Error*
ReadTextFile(const std::string& path, std::string* contents)
{
std::ifstream in(path, std::ios::in | std::ios::binary);
if (!in) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
("failed to open/read file '" + path + "': " + strerror(errno))
.c_str());
}
in.seekg(0, std::ios::end);
contents->resize(in.tellg());
in.seekg(0, std::ios::beg);
in.read(&(*contents)[0], contents->size());
in.close();
return nullptr; // success
}
TRITONSERVER_Error*
IsDirectory(const std::string& path, bool* is_dir)
{
*is_dir = false;
struct stat st;
if (stat(path.c_str(), &st) != 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to stat file ") + path).c_str());
}
*is_dir = S_ISDIR(st.st_mode);
return nullptr; // success
}
std::string
JoinPath(std::initializer_list<std::string> segments)
{
std::string joined;
for (const auto& seg : segments) {
if (joined.empty()) {
joined = seg;
} else if (!seg.empty() && (seg[0] == '/')) { // IsAbsolutePath(seg)
if (joined[joined.size() - 1] == '/') {
joined.append(seg.substr(1));
} else {
joined.append(seg);
}
} else { // !IsAbsolutePath(seg)
if (joined[joined.size() - 1] != '/') {
joined.append("/");
}
joined.append(seg);
}
}
return joined;
}
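//
// Example (worked cases, illustrative):
//   JoinPath({"/models", "resnet", "1"})  -> "/models/resnet/1"
//   JoinPath({"/models/", "/resnet"})     -> "/models/resnet"
//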
TRITONSERVER_Error*
ModelPaths(
const std::string& model_repository_path, uint64_t version,
const bool ignore_directories, const bool ignore_files,
std::unordered_map<std::string, std::string>* model_paths)
{
std::set<std::string> model_files;
// Read all the files in 'path' and filter by type for different requirements
auto path = JoinPath({model_repository_path, std::to_string(version)});
RETURN_IF_ERROR(GetDirectoryContents(path, &model_files));
if (ignore_directories) {
// Erase directory entries...
for (auto iter = model_files.begin(); iter != model_files.end();) {
bool is_dir;
RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
if (is_dir) {
iter = model_files.erase(iter);
} else {
++iter;
}
}
}
if (ignore_files) {
// Erase non-directory entries...
for (auto iter = model_files.begin(); iter != model_files.end();) {
bool is_dir;
RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
if (!is_dir) {
iter = model_files.erase(iter);
} else {
++iter;
}
}
}
for (const auto& filename : model_files) {
const auto model_path = JoinPath({path, filename});
model_paths->emplace(
std::piecewise_construct, std::make_tuple(filename),
std::make_tuple(model_path));
}
return nullptr; // success
}
TRITONSERVER_Error*
CreateCudaStream(
const int device_id, const int cuda_stream_priority, cudaStream_t* stream)
{
*stream = nullptr;
#ifdef TRITON_ENABLE_GPU
// Make sure that correct device is set before creating stream and
// then restore the device to what was set by the caller.
int current_device;
auto cuerr = cudaGetDevice(&current_device);
bool overridden = false;
if (cuerr == cudaSuccess) {
overridden = (current_device != device_id);
if (overridden) {
cuerr = cudaSetDevice(device_id);
}
}
if (cuerr == cudaSuccess) {
cuerr = cudaStreamCreateWithPriority(
stream, cudaStreamDefault, cuda_stream_priority);
}
if (overridden) {
cudaSetDevice(current_device);
}
if (cuerr != cudaSuccess) {
*stream = nullptr;
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to create stream: ") + cudaGetErrorString(cuerr))
.c_str());
}
#endif // TRITON_ENABLE_GPU
return nullptr; // success
}
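//
// Example usage (illustrative sketch): create a per-instance stream at default
// priority and destroy it when the owning instance is destroyed.
//
//   cudaStream_t stream = nullptr;
//   RETURN_IF_ERROR(CreateCudaStream(device_id, 0 /* priority */, &stream));
//   // ... use 'stream' for CopyBuffer() and kernel launches ...
//   #ifdef TRITON_ENABLE_GPU
//   if (stream != nullptr) {
//     cudaStreamDestroy(stream);
//   }
//   #endif  // TRITON_ENABLE_GPU
//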
TRITONSERVER_Error*
ParseLongLongValue(const std::string& value, int64_t* parsed_value)
{
try {
*parsed_value = std::stoll(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value +
"' to long long integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseUnsignedLongLongValue(const std::string& value, uint64_t* parsed_value)
{
try {
*parsed_value = std::stoull(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value +
"' to unsigned long long integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseBoolValue(const std::string& value, bool* parsed_value)
{
std::string lvalue = value;
std::transform(
lvalue.begin(), lvalue.end(), lvalue.begin(),
[](unsigned char c) { return std::tolower(c); });
if ((lvalue == "true") || (lvalue == "on") || (lvalue == "1")) {
*parsed_value = true;
return nullptr; // success
}
if ((lvalue == "false") || (lvalue == "off") || (lvalue == "0")) {
*parsed_value = false;
return nullptr; // success
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to boolean").c_str());
}
TRITONSERVER_Error*
ParseIntValue(const std::string& value, int* parsed_value)
{
try {
*parsed_value = std::stoi(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseDoubleValue(const std::string& value, double* parsed_value)
{
try {
*parsed_value = std::stod(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to double number")
.c_str());
}
return nullptr; // success
}
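//
// Example usage (illustrative): the Parse* helpers above convert the string
// values commonly found in model configuration parameters.
//
//   bool flag = false;
//   RETURN_IF_ERROR(ParseBoolValue("ON", &flag));   // flag == true
//   int count = 0;
//   RETURN_IF_ERROR(ParseIntValue("42", &count));   // count == 42
//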
TRITONSERVER_Error*
GetParameterValue(
triton::common::TritonJson::Value& params, const std::string& key,
std::string* value)
{
triton::common::TritonJson::Value json_value;
RETURN_ERROR_IF_FALSE(
params.Find(key.c_str(), &json_value), TRITONSERVER_ERROR_NOT_FOUND,
std::string("model configuration is missing the parameter ") + key);
RETURN_IF_ERROR(json_value.MemberAsString("string_value", value));
return nullptr; // success
}
TRITONSERVER_Error*
BatchInput::ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchInput>* batch_inputs)
{
batch_inputs->clear();
triton::common::TritonJson::Value bis;
RETURN_IF_ERROR(config.MemberAsArray("batch_input", &bis));
for (size_t i = 0; i < bis.ArraySize(); ++i) {
triton::common::TritonJson::Value bi;
RETURN_IF_ERROR(bis.IndexAsObject(i, &bi));
batch_inputs->emplace_back();
RETURN_IF_ERROR(batch_inputs->back().Init(bi));
}
return nullptr; // success
}
TRITONSERVER_Error*
BatchInput::Init(triton::common::TritonJson::Value& bi_config)
{
{
triton::common::TritonJson::Value bi_target_names;
RETURN_IF_ERROR(bi_config.MemberAsArray("target_name", &bi_target_names));
for (size_t i = 0; i < bi_target_names.ArraySize(); ++i) {
std::string tn;
RETURN_IF_ERROR(bi_target_names.IndexAsString(i, &tn));
target_names_.emplace_back(std::move(tn));
}
}
{
RETURN_IF_ERROR(bi_config.MemberAsString("kind", &kind_str_));
if (kind_str_ == "BATCH_ELEMENT_COUNT") {
kind_ = Kind::BATCH_ELEMENT_COUNT;
} else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT") {
kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT;
} else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO") {
kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO;
} else if (kind_str_ == "BATCH_MAX_ELEMENT_COUNT_AS_SHAPE") {
kind_ = Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE;
} else if (kind_str_ == "BATCH_ITEM_SHAPE") {
kind_ = Kind::BATCH_ITEM_SHAPE;
} else if (kind_str_ == "BATCH_ITEM_SHAPE_FLATTEN") {
kind_ = Kind::BATCH_ITEM_SHAPE_FLATTEN;
} else {
RETURN_ERROR_IF_FALSE(
false, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch input kind '" + kind_str_ + "'"));
}
}
{
std::string bi_dtype;
RETURN_IF_ERROR(bi_config.MemberAsString("data_type", &bi_dtype));
data_type_ = ModelConfigDataTypeToTritonServerDataType(bi_dtype);
RETURN_ERROR_IF_TRUE(
data_type_ == TRITONSERVER_TYPE_INVALID, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch input data type '" + bi_dtype + "'"));
}
{
triton::common::TritonJson::Value bi_source_inputs;
RETURN_IF_ERROR(bi_config.MemberAsArray("source_input", &bi_source_inputs));
for (size_t i = 0; i < bi_source_inputs.ArraySize(); ++i) {
std::string si;
RETURN_IF_ERROR(bi_source_inputs.IndexAsString(i, &si));
source_inputs_.emplace_back(std::move(si));
}
}
return nullptr; // success
}
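//
// Example (illustrative): a model configuration fragment, written in the usual
// config.pbtxt form, that this parser accepts once converted to JSON; it
// produces one BatchInput with kind BATCH_ELEMENT_COUNT.
//
//   batch_input [
//     {
//       kind: BATCH_ELEMENT_COUNT
//       target_name: "ELEMENT_COUNT"
//       data_type: TYPE_INT32
//       source_input: "INPUT0"
//     }
//   ]
//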
TRITONSERVER_DataType
ModelConfigDataTypeToTritonServerDataType(const std::string& data_type_str)
{
// Must start with "TYPE_".
if (data_type_str.rfind("TYPE_", 0) != 0) {
return TRITONSERVER_TYPE_INVALID;
}
const std::string dtype = data_type_str.substr(strlen("TYPE_"));
if (dtype == "BOOL") {
return TRITONSERVER_TYPE_BOOL;
} else if (dtype == "UINT8") {
return TRITONSERVER_TYPE_UINT8;
} else if (dtype == "UINT16") {
return TRITONSERVER_TYPE_UINT16;
} else if (dtype == "UINT32") {
return TRITONSERVER_TYPE_UINT32;
} else if (dtype == "UINT64") {
return TRITONSERVER_TYPE_UINT64;
} else if (dtype == "INT8") {
return TRITONSERVER_TYPE_INT8;
} else if (dtype == "INT16") {
return TRITONSERVER_TYPE_INT16;
} else if (dtype == "INT32") {
return TRITONSERVER_TYPE_INT32;
} else if (dtype == "INT64") {
return TRITONSERVER_TYPE_INT64;
} else if (dtype == "FP16") {
return TRITONSERVER_TYPE_FP16;
} else if (dtype == "FP32") {
return TRITONSERVER_TYPE_FP32;
} else if (dtype == "FP64") {
return TRITONSERVER_TYPE_FP64;
} else if (dtype == "STRING") {
return TRITONSERVER_TYPE_BYTES;
} else if (dtype == "BF16") {
return TRITONSERVER_TYPE_BF16;
}
return TRITONSERVER_TYPE_INVALID;
}
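//
// Example (worked cases, illustrative):
//   ModelConfigDataTypeToTritonServerDataType("TYPE_FP32")   -> TRITONSERVER_TYPE_FP32
//   ModelConfigDataTypeToTritonServerDataType("TYPE_STRING") -> TRITONSERVER_TYPE_BYTES
//   ModelConfigDataTypeToTritonServerDataType("FP32")        -> TRITONSERVER_TYPE_INVALID
//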
TRITONSERVER_Error*
BatchOutput::ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchOutput>* batch_outputs)
{
batch_outputs->clear();
triton::common::TritonJson::Value bos;
RETURN_IF_ERROR(config.MemberAsArray("batch_output", &bos));
for (size_t i = 0; i < bos.ArraySize(); ++i) {
batch_outputs->emplace_back();
auto& batch_output = batch_outputs->back();
triton::common::TritonJson::Value bo;
RETURN_IF_ERROR(bos.IndexAsObject(i, &bo));
{
triton::common::TritonJson::Value bo_target_names;
RETURN_IF_ERROR(bo.MemberAsArray("target_name", &bo_target_names));
for (size_t i = 0; i < bo_target_names.ArraySize(); ++i) {
std::string tn;
RETURN_IF_ERROR(bo_target_names.IndexAsString(i, &tn));
batch_output.target_names_.emplace_back(std::move(tn));
}
}
{
std::string bo_kind;
RETURN_IF_ERROR(bo.MemberAsString("kind", &bo_kind));
if (bo_kind == "BATCH_SCATTER_WITH_INPUT_SHAPE") {
batch_output.kind_ = Kind::BATCH_SCATTER_WITH_INPUT_SHAPE;
// Keep track of the output info for later cross reference with input
int64_t mbs = 0;
RETURN_IF_ERROR(config.MemberAsInt("max_batch_size", &mbs));
if (mbs != 0) {
batch_output.shape_.push_back(-1);
}
triton::common::TritonJson::Value ios;
RETURN_IF_ERROR(config.MemberAsArray("output", &ios));
for (size_t i = 0; i < ios.ArraySize(); i++) {
triton::common::TritonJson::Value io;
RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (io_name == batch_output.target_names_[0]) {
std::string io_dtype;
RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
batch_output.data_type_ =
ModelConfigDataTypeToTritonServerDataType(io_dtype);
// If a reshape is provided for the input then use that when
// validating that the model matches what is expected.
triton::common::TritonJson::Value reshape;
if (io.Find("reshape", &reshape)) {
RETURN_IF_ERROR(
ParseShape(reshape, "shape", &batch_output.shape_));
} else {
RETURN_IF_ERROR(ParseShape(io, "dims", &batch_output.shape_));
}
break;
}
}
} else {
RETURN_ERROR_IF_FALSE(
false, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch output kind '" + bo_kind + "'"));
}
}
{
triton::common::TritonJson::Value bo_source_inputs;
RETURN_IF_ERROR(bo.MemberAsArray("source_input", &bo_source_inputs));
for (size_t i = 0; i < bo_source_inputs.ArraySize(); ++i) {
std::string si;
RETURN_IF_ERROR(bo_source_inputs.IndexAsString(i, &si));
batch_output.source_inputs_.emplace_back(std::move(si));
}
}
}
return nullptr; // success
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
std::string* value, const std::string& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
RETURN_IF_ERROR(json_value.MemberAsString("string_value", value));
} else {
*value = default_value;
}
return nullptr; // success
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
int* value, const int& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseIntValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
bool* value, const bool& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseBoolValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
uint64_t* value, const uint64_t& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseUnsignedLongLongValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
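//
// Example usage (illustrative sketch; 'model_config' is the parsed model
// configuration JSON and "max_queue_size" is a hypothetical parameter name):
// read an optional numeric parameter, falling back to a default.
//
//   triton::common::TritonJson::Value params;
//   uint64_t max_queue_size = 0;
//   if (model_config.Find("parameters", &params)) {
//     RETURN_IF_ERROR(TryParseModelStringParameter(
//         params, "max_queue_size", &max_queue_size, 4 /* default */));
//   }
//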
namespace {
template <typename T>
TRITONSERVER_Error*
BufferAsTypedString(
std::string& str, const char* buffer, const size_t element_cnt)
{
const T* vals = reinterpret_cast<const T*>(buffer);
str += "[ ";
for (size_t i = 0; i < element_cnt; ++i) {
const T& v = vals[i];
if (i != 0) {
str += ", ";
}
str += std::to_string(v);
}
str += " ]";
return nullptr; // success
}
} // namespace
TRITONSERVER_Error*
BufferAsTypedString(
std::string& str, const char* buffer, size_t buffer_byte_size,
TRITONSERVER_DataType datatype)
{
const size_t element_cnt =
buffer_byte_size / TRITONSERVER_DataTypeByteSize(datatype);
switch (datatype) {
case TRITONSERVER_TYPE_UINT8:
return BufferAsTypedString<uint8_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT16:
return BufferAsTypedString<uint16_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT32:
return BufferAsTypedString<uint32_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT64:
return BufferAsTypedString<uint64_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT8:
return BufferAsTypedString<int8_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT16:
return BufferAsTypedString<int16_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT32:
return BufferAsTypedString<int32_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT64:
return BufferAsTypedString<int64_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_FP32:
return BufferAsTypedString<float>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_FP64:
return BufferAsTypedString<double>(str, buffer, element_cnt);
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
std::string("class result not available for output due to "
"unsupported type '") +
std::string(TRITONSERVER_DataTypeString(datatype)) + "'")
.c_str());
}
return nullptr; // success
}
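//
// Example (worked case, illustrative): three INT32 values are rendered as a
// readable string, e.g. for verbose logging of an output buffer.
//
//   const int32_t vals[3] = {1, 2, 3};
//   std::string str;
//   RETURN_IF_ERROR(BufferAsTypedString(
//       str, reinterpret_cast<const char*>(vals), sizeof(vals),
//       TRITONSERVER_TYPE_INT32));
//   // str == "[ 1, 2, 3 ]"
//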
std::string
GetRequestId(TRITONBACKEND_Request* request)
{
const char* request_id = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_RequestId(request, &request_id),
"unable to retrieve request ID string");
if ((request_id == nullptr) || (request_id[0] == '\0')) {
request_id = "<id_unknown>";
}
return std::string("[request id: ") + request_id + "] ";
}
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_input_collector.h"
#include <atomic>
#include "triton/backend/backend_common.h"
#ifdef TRITON_ENABLE_GPU
#include "kernel.h"
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
//
// BackendInputCollector::InputIterator
//
BackendInputCollector::InputIterator::InputIterator(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,
const char* host_policy_name, const bool coalesce_request_input)
: requests_(requests), request_count_(request_count), responses_(responses),
input_name_(input_name), host_policy_(host_policy_name),
coalesce_request_input_(coalesce_request_input), curr_request_idx_(0),
curr_buffer_idx_(0), reach_end_(false)
{
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
}
bool
BackendInputCollector::InputIterator::GetNextContiguousInput(
ContiguousBuffer* input)
{
if (reach_end_ || (curr_buffer_idx_ >= curr_buffer_cnt_)) {
return false;
}
// Get the first buffer
TRITONBACKEND_InputBufferForHostPolicy(
curr_input_, host_policy_, curr_buffer_idx_,
reinterpret_cast<const void**>(&input->memory_desc_.buffer_),
&input->memory_desc_.byte_size_, &input->memory_desc_.memory_type_,
&input->memory_desc_.memory_type_id_);
++curr_buffer_idx_;
input->start_request_idx_ = curr_request_idx_;
input->end_request_idx_ = curr_request_idx_;
if (!coalesce_request_input_) {
if (curr_buffer_idx_ >= curr_buffer_cnt_) {
++curr_request_idx_;
if (curr_request_idx_ < request_count_) {
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
// reset buffer idx
curr_buffer_idx_ = 0;
} else {
reach_end_ = true;
}
}
return true;
}
do {
for (; curr_buffer_idx_ < curr_buffer_cnt_; ++curr_buffer_idx_) {
const void* next_buffer;
size_t next_buffer_byte_size;
TRITONSERVER_MemoryType next_memory_type;
int64_t next_memory_type_id;
TRITONBACKEND_InputBufferForHostPolicy(
curr_input_, host_policy_, curr_buffer_idx_, &next_buffer,
&next_buffer_byte_size, &next_memory_type, &next_memory_type_id);
if (((input->memory_desc_.buffer_ + input->memory_desc_.byte_size_) !=
next_buffer) ||
(input->memory_desc_.memory_type_ != next_memory_type) ||
(input->memory_desc_.memory_type_id_ != next_memory_type_id)) {
return true;
}
input->memory_desc_.byte_size_ += next_buffer_byte_size;
input->end_request_idx_ = curr_request_idx_;
}
// Iterated all buffers for current request, check next
++curr_request_idx_;
if (curr_request_idx_ < request_count_) {
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
// reset buffer idx
curr_buffer_idx_ = 0;
}
} while (curr_request_idx_ < request_count_);
reach_end_ = true;
return true;
}
//
// BackendInputCollector
//
bool
BackendInputCollector::GetInputBufferIfContiguous(
const char* input_name, const char** buffer, size_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
*buffer = nullptr;
*buffer_byte_size = 0;
const char* expected_next_buffer = nullptr;
bool contiguous = true;
for (size_t idx = 0; idx < request_count_; idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
TRITONBACKEND_Input* input;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestInput(request, input_name, &input));
uint64_t byte_size;
uint32_t buffer_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, nullptr,
nullptr, &byte_size, &buffer_count));
for (size_t idx = 0; idx < buffer_count; ++idx) {
const void* src_buffer;
size_t src_byte_size;
TRITONSERVER_MemoryType src_memory_type;
int64_t src_memory_type_id;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_InputBufferForHostPolicy(
input, host_policy_cstr_, idx, &src_buffer, &src_byte_size,
&src_memory_type, &src_memory_type_id));
if (*buffer != nullptr) {
// A second buffer has been seen. If coalescing of request inputs is not
// requested, or this buffer does not directly extend the previous one,
// treat the input as not contiguous.
if (coalesce_request_input_ && (expected_next_buffer == src_buffer) &&
(*memory_type == src_memory_type) &&
(*memory_type_id == src_memory_type_id)) {
expected_next_buffer += src_byte_size;
} else {
contiguous = false;
}
// Want to know total buffer byte size even if it is not contiguous
*buffer_byte_size += src_byte_size;
} else {
*buffer = reinterpret_cast<const char*>(src_buffer);
*memory_type = src_memory_type;
*memory_type_id = src_memory_type_id;
*buffer_byte_size = src_byte_size;
expected_next_buffer = *buffer + src_byte_size;
}
}
}
return contiguous;
}
void
BackendInputCollector::ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that a pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
const bool use_kernel = (kernel_buffer_threshold_ != 0);
size_t buffer_offset = 0;
InputIterator ii(
requests_, request_count_, responses_, input_name, host_policy_cstr_,
coalesce_request_input_);
ContiguousBuffer input;
while (ii.GetNextContiguousInput(&input)) {
// If there are pending copies from the tensor buffer that are not
// contiguous with this input's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(buffer_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(
buffer, buffer_byte_size, memory_type, memory_type_id);
}
if ((pending_copy_kernel_buffer_byte_size_ > 0) &&
(buffer_offset != (pending_copy_kernel_buffer_byte_size_ +
pending_copy_kernel_buffer_offset_))) {
need_sync_ |= FlushPendingCopyKernel(
buffer, buffer_byte_size, memory_type, memory_type_id);
}
need_sync_ |= SetInputTensor(
input_name, input, buffer, buffer_byte_size, memory_type,
memory_type_id, buffer_offset, use_pinned_memory_type, use_kernel,
true);
buffer_offset += input.memory_desc_.byte_size_;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |=
FlushPendingPinned(buffer, buffer_byte_size, memory_type, memory_type_id);
need_sync_ |= FlushPendingCopyKernel(
buffer, buffer_byte_size, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
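// Illustrative sketch only (not part of the original source): how a backend
// might use the ProcessTensor() overload above to gather an input named
// "INPUT0" into a pre-allocated buffer on GPU 0. The collector and buffer
// arguments are assumptions.
inline bool
ExampleGatherInput(
    BackendInputCollector* collector, char* gpu_buffer,
    const size_t gpu_buffer_byte_size)
{
  // Gather all request buffers for "INPUT0" into 'gpu_buffer' on GPU 0.
  collector->ProcessTensor(
      "INPUT0", gpu_buffer, gpu_buffer_byte_size, TRITONSERVER_MEMORY_GPU,
      0 /* memory_type_id */);
  // Finalize() reports whether asynchronous copies were issued on the
  // collector's CUDA stream; if true the caller must synchronize that
  // stream before reading 'gpu_buffer'.
  return collector->Finalize();
}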
TRITONSERVER_Error*
BackendInputCollector::ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
if (buffer == nullptr) {
if (allowed_input_types.size() == 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must contain at least one pair of memory type "
"and id");
}
if (GetInputBufferIfContiguous(
input_name, dst_buffer, dst_buffer_byte_size, dst_memory_type,
dst_memory_type_id)) {
// A zero-size buffer is also treated as contiguous, but in that case we
// still fall through to allocate backend memory so that a valid address
// is returned.
if (*dst_buffer_byte_size != 0) {
// If the buffer is contiguous, check whether its memory type and id are accepted by the caller
for (const auto& allowed_type : allowed_input_types) {
if ((*dst_memory_type == allowed_type.first) &&
((*dst_memory_type_id == allowed_type.second))) {
return nullptr; // success
}
}
}
}
// A separate buffer is needed
BackendMemory* backend_memory = nullptr;
for (const auto& allowed_type : allowed_input_types) {
std::vector<BackendMemory::AllocationType> alloc_types;
const int64_t memory_type_id = allowed_type.second;
switch (allowed_type.first) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
auto err = BackendMemory::Create(
memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
&backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(allowed_type.first) +
" id: " + std::to_string(memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
break;
}
}
if (backend_memory == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to allocate contiguous buffer for input '") +
input_name + "'")
.c_str());
}
buffer = backend_memory->MemoryPtr();
*dst_buffer = backend_memory->MemoryPtr();
*dst_buffer_byte_size = backend_memory->ByteSize();
*dst_memory_type = backend_memory->MemoryType();
*dst_memory_type_id = backend_memory->MemoryTypeId();
} else {
if (allowed_input_types.size() != 1) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must only contain the memory type and id of "
"'buffer'");
}
*dst_buffer = buffer;
*dst_buffer_byte_size = buffer_byte_size;
*dst_memory_type = allowed_input_types[0].first;
*dst_memory_type_id = allowed_input_types[0].second;
}
if (*dst_buffer_byte_size != 0) {
ProcessTensor(
input_name, buffer, *dst_buffer_byte_size, *dst_memory_type,
*dst_memory_type_id);
}
return nullptr; // success
}
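// Illustrative sketch only (not part of the original source): using the
// overload above to obtain a contiguous buffer for "INPUT0", letting the
// collector allocate memory if the request buffers are not already
// contiguous. All names other than the collector API are assumptions.
inline TRITONSERVER_Error*
ExampleContiguousInput(
    BackendInputCollector* collector, const char** dst_buffer,
    size_t* dst_byte_size, TRITONSERVER_MemoryType* dst_memory_type,
    int64_t* dst_memory_type_id)
{
  // Prefer GPU 0 and fall back to pinned CPU memory.
  const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed{
      {TRITONSERVER_MEMORY_GPU, 0}, {TRITONSERVER_MEMORY_CPU_PINNED, 0}};
  return collector->ProcessTensor(
      "INPUT0", nullptr /* buffer */, 0 /* buffer_byte_size */, allowed,
      dst_buffer, dst_byte_size, dst_memory_type, dst_memory_type_id);
}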
bool
BackendInputCollector::Finalize()
{
#ifdef TRITON_ENABLE_GPU
if ((!deferred_pinned_.empty()) && need_sync_) {
if (event_ != nullptr) {
cudaEventSynchronize(event_);
} else {
cudaStreamSynchronize(stream_);
}
need_sync_ = false;
}
#endif // TRITON_ENABLE_GPU
// After the above sync all the GPU->pinned copies are complete. Any
// deferred copies of pinned->CPU can now be done.
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
for (auto& def : deferred_pinned_) {
if (!def.finalized_) {
need_sync_ |= def.Finalize(stream_);
}
}
for (size_t i = 0; i < async_task_count_; i++) {
need_sync_ |= completion_queue_.Get();
}
#ifdef TRITON_ENABLE_GPU
// Record the new event location if deferred copies occur
if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
return need_sync_;
}
bool
BackendInputCollector::DeferredPinned::Finalize(cudaStream_t stream)
{
bool cuda_used = false;
auto err = CopyBuffer(
"pinned buffer", TRITONSERVER_MEMORY_CPU_PINNED, 0, tensor_memory_type_,
tensor_memory_id_, pinned_memory_size_, pinned_memory_,
tensor_buffer_ + tensor_buffer_offset_, stream, &cuda_used);
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : requests_) {
for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
TRITONSERVER_ErrorDelete(err);
}
return cuda_used;
}
bool
BackendInputCollector::SetInputTensor(
const char* input_name, const ContiguousBuffer& input, char* tensor_buffer,
const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
const TRITONSERVER_MemoryType use_pinned_memory_type, const bool use_kernel,
const bool wait_buffer)
{
bool cuda_copy = false;
if ((tensor_buffer_offset + input.memory_desc_.byte_size_) >
tensor_buffer_byte_size) {
for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
++i) {
RESPOND_AND_SET_NULL_IF_ERROR(
&(*responses_)[i],
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected total byte size " +
std::to_string(
tensor_buffer_offset + input.memory_desc_.byte_size_) +
" for input '" + input_name + "', expecting " +
std::to_string(tensor_buffer_byte_size))
.c_str()));
}
return cuda_copy;
}
// If the request buffer matches the memory type that should use an
// intermediate pinned memory buffer for the transfer, then just
// record the input as pending and increase the size required for
// the intermediate pinned buffer. We only do this check for the
// first buffer of an input and apply the same policy for all
// buffers. So if an input's data is split over different memory
// types this may not be ideal but that should be a very rare
// situation.
if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
(input.memory_desc_.memory_type_ == use_pinned_memory_type)) {
if (pending_pinned_byte_size_ == 0) {
pending_pinned_offset_ = tensor_buffer_offset;
}
pending_pinned_byte_size_ += input.memory_desc_.byte_size_;
pending_pinned_input_buffers_.push_back(input);
return cuda_copy;
}
// [FIXME] support the other direction if proven to be faster; all kernel
// handling code in this class assumes the destination buffer is on device.
// If the request buffer and the destination buffer are accessible by all
// GPUs (i.e. pinned, device), initiate the copy via copy CUDA kernel.
// We only do this check for the
// first buffer of an input and apply the same policy for all
// buffers. So if an input's data is split over different memory
// types this may not be ideal but that should be a very rare
// situation.
// Currently checked direction:
// pinned -> device
// same device -> device
// different device -> device
if (use_kernel &&
(input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_CPU) &&
(tensor_memory_type == TRITONSERVER_MEMORY_GPU)) {
// [FIXME] Currently not allowing copy between devices as it requires
// peer-to-peer access to be enabled. Peer-to-peer is enabled by default,
// but the server can still run even if it fails to enable peer-to-peer.
// Should provide a utility to check whether a device pair allows direct
// access and use gather kernel accordingly
if ((input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_GPU) ||
(input.memory_desc_.memory_type_id_ == tensor_memory_type_id)) {
if (pending_copy_kernel_buffer_byte_size_ == 0) {
pending_copy_kernel_buffer_offset_ = tensor_buffer_offset;
}
pending_copy_kernel_buffer_byte_size_ += input.memory_desc_.byte_size_;
++pending_copy_kernel_input_buffer_counts_;
pending_copy_kernel_input_buffers_.push_back(input);
return cuda_copy;
}
}
#ifdef TRITON_ENABLE_GPU
if (wait_buffer && (buffer_ready_event_ != nullptr)) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
// Direct copy without intermediate pinned memory.
bool cuda_used = false;
auto err = CopyBuffer(
input_name, input.memory_desc_.memory_type_,
input.memory_desc_.memory_type_id_, tensor_memory_type,
tensor_memory_type_id, input.memory_desc_.byte_size_,
input.memory_desc_.buffer_, tensor_buffer + tensor_buffer_offset, stream_,
&cuda_used, copy_on_stream_);
if (err != nullptr) {
for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
++i) {
RESPOND_AND_SET_NULL_IF_ERROR(
&(*responses_)[i],
TRITONSERVER_ErrorNew(
TRITONSERVER_ErrorCode(err), TRITONSERVER_ErrorMessage(err)));
}
TRITONSERVER_ErrorDelete(err);
}
cuda_copy |= cuda_used;
return cuda_copy;
}
bool
BackendInputCollector::FlushPendingPinned(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
bool cuda_copy = false;
// Will be copying from CPU->pinned->GPU or GPU->pinned->CPU
// Attempt to allocate a pinned buffer to use for staging the
// copy... if we fail to allocate the pinned buffer then we just
// directly go CPU->GPU or GPU->CPU.
char* pinned_memory = nullptr;
int64_t pinned_memory_type_id = 0;
TRITONSERVER_MemoryType pinned_memory_type;
BackendMemory* backend_memory;
if (pending_pinned_byte_size_ > 0) {
TRITONSERVER_Error* err = BackendMemory::Create(
memory_manager_,
{BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED},
0 /* memory_type_id */, pending_pinned_byte_size_, &backend_memory);
if (err != nullptr) {
TRITONSERVER_ErrorDelete(err);
} else {
pinned_memory = backend_memory->MemoryPtr();
pinned_memory_type = backend_memory->MemoryType();
pinned_memory_type_id = backend_memory->MemoryTypeId();
}
}
// If the pinned buffer wasn't actually allocated then just perform
// a direct copy.
if (pinned_memory == nullptr) {
size_t offset = 0;
for (auto& pr : pending_pinned_input_buffers_) {
cuda_copy |= SetInputTensor(
"pinned fallback", pr, tensor_buffer, tensor_buffer_byte_size,
tensor_memory_type, tensor_memory_type_id,
pending_pinned_offset_ + offset, TRITONSERVER_MEMORY_CPU_PINNED,
false, true);
offset += pr.memory_desc_.byte_size_;
}
}
// We have a pinned buffer so copy the pending input buffer(s) into
// the pinned memory.
else { // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED
bool cuda_used = false;
size_t offset = 0;
if (!use_async_cpu_copy_) {
for (auto& pr : pending_pinned_input_buffers_) {
cuda_used |= SetInputTensor(
"pinned H2H", pr, pinned_memory, pending_pinned_byte_size_,
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, true);
offset += pr.memory_desc_.byte_size_;
}
cuda_copy |= cuda_used;
// If the copy was not async (i.e. the request input was in CPU memory
// so a CPU->CPU-PINNED copy was performed above), then the pinned
// buffer now holds the tensor contents and we can immediately
// issue the copies from the pinned buffer to the tensor.
//
// Otherwise the GPU->CPU-PINNED async copies are in flight and we
// simply remember the pinned buffer and the corresponding
// request inputs so that we can do the pinned->CPU copies in
// finalize after we have waited for all async copies to complete.
if (!cuda_used) {
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
auto err = CopyBuffer(
"pinned input buffer H2D", TRITONSERVER_MEMORY_CPU_PINNED,
0 /* memory_type_id */, tensor_memory_type, tensor_memory_type_id,
pending_pinned_byte_size_, pinned_memory,
tensor_buffer + pending_pinned_offset_, stream_, &cuda_used,
copy_on_stream_);
cuda_copy |= cuda_used;
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : pending_pinned_input_buffers_) {
for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx],
TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
TRITONSERVER_ErrorDelete(err);
}
} else { // cuda_used
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_, tensor_buffer,
pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
std::move(pending_pinned_input_buffers_), responses_);
}
} else {
async_task_count_++;
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_, tensor_buffer,
pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
std::move(pending_pinned_input_buffers_), responses_);
auto& deferred_pinned = deferred_pinned_.back();
// Mark finalized to avoid a duplicate call to DeferredPinned::Finalize()
// in BackendInputCollector::Finalize()
deferred_pinned_.back().finalized_ = true;
auto incomplete_count = new std::atomic<size_t>(std::min(
deferred_pinned_.back().requests_.size(),
triton::common::AsyncWorkQueue::WorkerCount()));
auto pending_pinned_byte_size = pending_pinned_byte_size_;
size_t stride = (deferred_pinned_.back().requests_.size() +
triton::common::AsyncWorkQueue::WorkerCount() - 1) /
triton::common::AsyncWorkQueue::WorkerCount();
auto pending_it = deferred_pinned_.back().requests_.begin();
while (pending_it != deferred_pinned_.back().requests_.end()) {
auto end_it = pending_it;
auto next_offset = offset;
for (size_t idx = 0; idx < stride; idx++) {
next_offset += end_it->memory_desc_.byte_size_;
end_it++;
if (end_it == deferred_pinned_.back().requests_.end()) {
break;
}
}
auto err =
CommonErrorToTritonError(triton::common::AsyncWorkQueue::AddTask(
[this, offset, pinned_memory, pinned_memory_type,
pending_pinned_byte_size, pinned_memory_type_id, pending_it,
end_it, incomplete_count, &deferred_pinned]() mutable {
for (; pending_it != end_it; pending_it++) {
SetInputTensor(
"pinned async H2H", *pending_it, pinned_memory,
pending_pinned_byte_size, pinned_memory_type,
pinned_memory_type_id, offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, false);
offset += pending_it->memory_desc_.byte_size_;
}
// The last segmented task will start the next phase of
// the internal pinned buffer copy
if (incomplete_count->fetch_sub(1) == 1) {
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
completion_queue_.Put(deferred_pinned.Finalize(stream_));
delete incomplete_count;
}
}));
if (err != nullptr) {
for (; pending_it != end_it; pending_it++) {
for (size_t idx = pending_it->start_request_idx_;
idx <= pending_it->end_request_idx_; ++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx],
TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
}
TRITONSERVER_ErrorDelete(err);
offset = next_offset;
pending_it = end_it;
}
}
}
// Pending pinned copies are handled...
pending_pinned_byte_size_ = 0;
pending_pinned_offset_ = 0;
pending_pinned_input_buffers_.clear();
// Need to hold on to the allocated pinned buffer as there are still
// copies in flight. Will delete it in finalize.
if (pinned_memory != nullptr) {
in_use_memories_.emplace_back(backend_memory);
}
return cuda_copy;
}
TRITONSERVER_Error*
BackendInputCollector::BatchInputShape(
const BatchInput& batch_input, std::vector<int64_t>* shape)
{
*shape = std::vector<int64_t>{0};
switch (batch_input.BatchInputKind()) {
case BatchInput::Kind::BATCH_ELEMENT_COUNT:
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
(*shape)[0] = request_count_;
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
(*shape)[0] = request_count_ + 1;
break;
}
case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
(*shape)[0] =
std::max((*shape)[0], GetElementCount(shape_arr, dims_count));
}
break;
}
case BatchInput::Kind::BATCH_ITEM_SHAPE: {
shape->emplace_back(0);
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
(*shape)[0] += shape_arr[0];
// The batch input tracks the shape without batch dimension for
// each batch item
(*shape)[1] = (dims_count - 1);
}
break;
}
case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
// The batch input tracks the shape without batch dimension for
// each batch item
(*shape)[0] += (shape_arr[0] * (dims_count - 1));
}
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "unsupported BatchInputKind received");
}
return nullptr; // success
}
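// Worked example (illustrative only, not part of the original source):
// with three requests whose source-input element counts are 4, 2 and 3,
// BatchInputShape() above produces
//   BATCH_ELEMENT_COUNT                        -> shape [3]
//   BATCH_ACCUMULATED_ELEMENT_COUNT            -> shape [3]
//   BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO  -> shape [4]
//   BATCH_MAX_ELEMENT_COUNT_AS_SHAPE           -> shape [4] (the max count)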
TRITONSERVER_Error*
BackendInputCollector::ProcessBatchInput(
const BatchInput& batch_input, char* buffer, const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
if (buffer == nullptr) {
if (allowed_input_types.size() == 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must contain at least one pair of memory type "
"and id");
}
// Calculate the byte size of the buffer
std::vector<int64_t> shape;
RETURN_IF_ERROR(BatchInputShape(batch_input, &shape));
*dst_buffer_byte_size = GetByteSize(batch_input.DataType(), shape);
BackendMemory* backend_memory = nullptr;
for (const auto& allowed_type : allowed_input_types) {
std::vector<BackendMemory::AllocationType> alloc_types;
const int64_t memory_type_id = allowed_type.second;
switch (allowed_type.first) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
auto err = BackendMemory::Create(
memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
&backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(allowed_type.first) +
" id: " + std::to_string(memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
break;
}
}
if (backend_memory == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string(
"failed to allocate contiguous buffer for batch input '") +
batch_input.TargetNames()[0] + "'")
.c_str());
}
buffer = backend_memory->MemoryPtr();
*dst_buffer = backend_memory->MemoryPtr();
*dst_buffer_byte_size = backend_memory->ByteSize();
*dst_memory_type = backend_memory->MemoryType();
*dst_memory_type_id = backend_memory->MemoryTypeId();
} else {
if (allowed_input_types.size() != 1) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must only contain the memory type and id of "
"'buffer'");
}
*dst_buffer = buffer;
*dst_buffer_byte_size = buffer_byte_size;
*dst_memory_type = allowed_input_types[0].first;
*dst_memory_type_id = allowed_input_types[0].second;
}
char* input_buffer = buffer;
std::unique_ptr<BackendMemory> internal_buffer;
// Need a CPU buffer for modifying the value
if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
BackendMemory* ib = nullptr;
RETURN_IF_ERROR(BackendMemory::Create(
memory_manager_,
{BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU},
0, *dst_buffer_byte_size, &ib));
internal_buffer.reset(ib);
input_buffer = internal_buffer->MemoryPtr();
}
const auto& data_type = batch_input.DataType();
switch (batch_input.BatchInputKind()) {
case BatchInput::Kind::BATCH_ELEMENT_COUNT: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
RETURN_IF_ERROR(SetElementCount<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
RETURN_IF_ERROR(SetElementCount<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
*reinterpret_cast<float*>(input_buffer) = 0;
RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
source_input, input_buffer + sizeof(float),
*dst_buffer_byte_size - sizeof(float)));
} else {
*reinterpret_cast<int32_t*>(input_buffer) = 0;
RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
source_input, input_buffer + sizeof(int32_t),
*dst_buffer_byte_size - sizeof(int32_t)));
}
break;
}
case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
// The batch input is described by the shape,
// no data modification is needed
return nullptr; // success
}
case BatchInput::Kind::BATCH_ITEM_SHAPE:
case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
// Use the same utilities for both types as the data will be the same,
// only difference is the shape of the tensor.
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
*reinterpret_cast<float*>(input_buffer) = 0;
RETURN_IF_ERROR(SetBatchItemShape<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
*reinterpret_cast<int32_t*>(input_buffer) = 0;
RETURN_IF_ERROR(SetBatchItemShape<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
}
if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
bool cuda_used;
RETURN_IF_ERROR(CopyBuffer(
"batch input buffer", internal_buffer->MemoryType(),
internal_buffer->MemoryTypeId(), *dst_memory_type, *dst_memory_type_id,
*dst_buffer_byte_size, input_buffer, buffer, stream_, &cuda_used,
copy_on_stream_));
// Need to keep the backend memory alive in the case of async copy
in_use_memories_.emplace_back(std::move(internal_buffer));
need_sync_ |= cuda_used;
}
return nullptr; // success
}
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
if (buffer_offset + sizeof(T) > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
"unexpected total byte size for batch input");
}
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
*(reinterpret_cast<T*>(buffer) + req_idx) =
GetElementCount(shape, dims_count);
buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 0
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
buffer_offset += sizeof(T)) {
*reinterpret_cast<T*>(buffer + buffer_offset) = 0;
}
return nullptr; // success
}
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetAccumulatedElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t accumulated_element_count = 0;
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
if (buffer_offset + sizeof(T) > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
"unexpected total byte size for batch input");
}
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
accumulated_element_count += GetElementCount(shape, dims_count);
*(reinterpret_cast<T*>(buffer) + req_idx) = accumulated_element_count;
buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 'accumulated_element_count'
// (no increase in element count)
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
buffer_offset += sizeof(T)) {
*reinterpret_cast<T*>(buffer + buffer_offset) = accumulated_element_count;
}
return nullptr; // success
}
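// Worked example (illustrative only, not part of the original source):
// for three requests with element counts 4, 2 and 3 and a buffer that
// holds five elements, SetElementCount() above writes {4, 2, 3, 0, 0}
// while SetAccumulatedElementCount() writes {4, 6, 9, 9, 9}; the trailing
// entries are padding for any batch slots beyond the request count.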
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetBatchItemShape(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
size_t batch_1_size = sizeof(T) * (dims_count - 1);
if (buffer_offset + (size_t)shape[0] * batch_1_size > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(GetRequestId(requests_[req_idx]) +
"unexpected total byte size for batch input")
.c_str());
}
// The batch input tracks the shape without batch dimension for
// each batch item
for (size_t idx = 1; idx < dims_count; ++idx) {
// Need to set the element explicitly for type conversion
*(reinterpret_cast<T*>(buffer + buffer_offset) + (idx - 1)) = shape[idx];
}
// memcpy the data repeatedly if the request has batch size > 1
for (int64_t idx = 1; idx < shape[0]; ++idx) {
memcpy(
buffer + buffer_offset + idx * batch_1_size, buffer + buffer_offset,
batch_1_size);
}
buffer_offset += batch_1_size * (size_t)shape[0];
}
return nullptr; // success
}
bool
BackendInputCollector::FlushPendingCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
if (pending_copy_kernel_input_buffers_.size() == 0) {
return false;
}
bool cuda_copy = false;
TRITONSERVER_Error* error = nullptr;
// Only try to launch kernel if buffer count is large enough for
// good GPU utilization
if (pending_copy_kernel_input_buffer_counts_ >= kernel_buffer_threshold_) {
error = LaunchCopyKernel(
tensor_buffer, tensor_buffer_byte_size, tensor_memory_type,
tensor_memory_type_id);
cuda_copy = (error == nullptr);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("gather kernel launched with status: ") +
((error == nullptr) ? "Success" : TRITONSERVER_ErrorMessage(error)))
.c_str());
}
// If kernel can't be launched then just perform a direct copy.
if ((pending_copy_kernel_input_buffer_counts_ < kernel_buffer_threshold_) ||
(error != nullptr)) {
size_t offset = 0;
for (auto& pr : pending_copy_kernel_input_buffers_) {
cuda_copy |= SetInputTensor(
"gather kernel fallback", pr, tensor_buffer, tensor_buffer_byte_size,
tensor_memory_type, tensor_memory_type_id,
pending_copy_kernel_buffer_offset_ + offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, true);
offset += pr.memory_desc_.byte_size_;
}
}
TRITONSERVER_ErrorDelete(error);
// Pending kernel copies are handled...
pending_copy_kernel_buffer_byte_size_ = 0;
pending_copy_kernel_buffer_offset_ = 0;
pending_copy_kernel_input_buffer_counts_ = 0;
pending_copy_kernel_input_buffers_.clear();
return cuda_copy;
}
TRITONSERVER_Error*
BackendInputCollector::LaunchCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
input_ptr_buffer_host_.emplace_back(new std::vector<int8_t*>());
byte_size_buffer_host_.emplace_back(new std::vector<size_t>());
byte_size_offset_buffer_host_.emplace_back(new std::vector<size_t>());
auto& input_ptr_buffer_host = *input_ptr_buffer_host_.back();
auto& byte_size_buffer_host = *byte_size_buffer_host_.back();
auto& byte_size_offset_buffer_host = *byte_size_offset_buffer_host_.back();
input_ptr_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_offset_buffer_host.reserve(
pending_copy_kernel_input_buffer_counts_);
size_t byte_size_offset = 0;
for (const auto& response_input : pending_copy_kernel_input_buffers_) {
const auto& input = response_input.memory_desc_;
input_ptr_buffer_host.emplace_back(
const_cast<int8_t*>(reinterpret_cast<const int8_t*>(input.buffer_)));
byte_size_buffer_host.emplace_back(input.byte_size_);
byte_size_offset_buffer_host.emplace_back(byte_size_offset);
byte_size_offset += input.byte_size_;
}
BackendMemory* backend_memory = nullptr;
std::vector<BackendMemory::AllocationType> alloc_types;
switch (tensor_memory_type) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
// input_ptr_buffer
size_t input_ptr_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*);
auto err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
input_ptr_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* input_ptr_buffer = backend_memory->MemoryPtr();
// byte_size_buffer
size_t byte_size_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
byte_size_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_buffer = backend_memory->MemoryPtr();
// byte_size_offset_buffer
size_t byte_size_offset_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
byte_size_offset_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_offset_buffer = backend_memory->MemoryPtr();
cudaMemcpyAsync(
input_ptr_buffer, input_ptr_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*),
cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
byte_size_buffer, byte_size_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
byte_size_offset_buffer, byte_size_offset_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
cudaMemcpyDefault, stream_);
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
RETURN_IF_CUDA_ERROR(
RunGatherKernel(
(const int8_t**)input_ptr_buffer, (const size_t*)byte_size_buffer,
(const size_t*)byte_size_offset_buffer,
(int8_t*)tensor_buffer + pending_copy_kernel_buffer_offset_,
pending_copy_kernel_input_buffer_counts_, stream_),
TRITONSERVER_ERROR_INTERNAL,
std::string("Failed to launch gather kernel"));
return nullptr;
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Copy kernel can not be launched with TRITON_ENABLE_GPU=OFF");
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_memory.h"
#include <map>
#include "triton/backend/backend_common.h"
namespace triton { namespace backend {
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)
{
*mem = nullptr;
void* ptr = nullptr;
switch (alloc_type) {
case AllocationType::CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
RETURN_IF_CUDA_ERROR(
cudaHostAlloc(&ptr, byte_size, cudaHostAllocPortable),
TRITONSERVER_ERROR_UNAVAILABLE,
std::string("failed to allocate pinned system memory"));
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"pinned-memory allocation not supported");
#endif // TRITON_ENABLE_GPU
break;
}
case AllocationType::GPU: {
#ifdef TRITON_ENABLE_GPU
int current_device;
RETURN_IF_CUDA_ERROR(
cudaGetDevice(&current_device), TRITONSERVER_ERROR_INTERNAL,
std::string("failed to get device"));
bool overridden = (current_device != memory_type_id);
if (overridden) {
RETURN_IF_CUDA_ERROR(
cudaSetDevice(memory_type_id), TRITONSERVER_ERROR_INTERNAL,
std::string("failed to set device"));
}
auto err = cudaMalloc(&ptr, byte_size);
if (overridden) {
LOG_IF_CUDA_ERROR(
cudaSetDevice(current_device), "failed to set CUDA device");
}
RETURN_ERROR_IF_FALSE(
err == cudaSuccess, TRITONSERVER_ERROR_UNAVAILABLE,
std::string("failed to allocate GPU memory: ") +
cudaGetErrorString(err));
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED, "GPU allocation not supported");
#endif // TRITON_ENABLE_GPU
break;
}
case AllocationType::CPU:
case AllocationType::CPU_PINNED_POOL:
case AllocationType::GPU_POOL:
RETURN_IF_ERROR(TRITONBACKEND_MemoryManagerAllocate(
manager, &ptr, AllocTypeToMemoryType(alloc_type), memory_type_id,
byte_size));
break;
}
*mem = new BackendMemory(
manager, alloc_type, memory_type_id, reinterpret_cast<char*>(ptr),
byte_size);
return nullptr; // success
}
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager,
const std::vector<AllocationType>& alloc_types,
const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)
{
*mem = nullptr;
RETURN_ERROR_IF_TRUE(
alloc_types.size() == 0, TRITONSERVER_ERROR_INVALID_ARG,
std::string("BackendMemory::Create, at least one allocation type must be "
"specified"));
bool success = false;
std::unordered_map<AllocationType, TRITONSERVER_Error*> errors;
for (const AllocationType alloc_type : alloc_types) {
TRITONSERVER_Error* err =
Create(manager, alloc_type, memory_type_id, byte_size, mem);
if (err == nullptr) {
success = true;
break;
}
errors.insert({alloc_type, err});
}
// If allocation failed for all allocation types then display all
// the error messages and show the entire allocation request as
// failing.
if (!success) {
std::string msg = "BackendMemory::Create, all allocation types failed:";
for (const auto& pr : errors) {
const AllocationType alloc_type = pr.first;
TRITONSERVER_Error* err = pr.second;
msg += std::string("\n\t") + AllocTypeString(alloc_type) + ": " +
TRITONSERVER_ErrorMessage(err);
TRITONSERVER_ErrorDelete(err);
}
return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNAVAILABLE, msg.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, void* buffer, const size_t byte_size,
BackendMemory** mem)
{
*mem = new BackendMemory(
manager, alloc_type, memory_type_id, reinterpret_cast<char*>(buffer),
byte_size, false /* owns_buffer */);
return nullptr; // success
}
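// Illustrative sketch only (not part of the original source): allocating a
// pinned staging buffer with a plain-CPU fallback using the multi-type
// Create() overload above. The manager pointer and byte size are assumptions.
inline TRITONSERVER_Error*
ExampleAllocateStagingBuffer(
    TRITONBACKEND_MemoryManager* manager, const size_t byte_size,
    BackendMemory** mem)
{
  // Try the pooled pinned allocator first, then direct pinned allocation,
  // and finally fall back to pageable CPU memory.
  return BackendMemory::Create(
      manager,
      {BackendMemory::AllocationType::CPU_PINNED_POOL,
       BackendMemory::AllocationType::CPU_PINNED,
       BackendMemory::AllocationType::CPU},
      0 /* memory_type_id */, byte_size, mem);
}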
BackendMemory::~BackendMemory()
{
if (owns_buffer_) {
switch (alloctype_) {
case AllocationType::CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
if (buffer_ != nullptr) {
LOG_IF_CUDA_ERROR(
cudaFreeHost(buffer_), "failed to free pinned memory");
}
#endif // TRITON_ENABLE_GPU
break;
case AllocationType::GPU:
#ifdef TRITON_ENABLE_GPU
if (buffer_ != nullptr) {
LOG_IF_CUDA_ERROR(cudaFree(buffer_), "failed to free CUDA memory");
}
#endif // TRITON_ENABLE_GPU
break;
case AllocationType::CPU:
case AllocationType::CPU_PINNED_POOL:
case AllocationType::GPU_POOL:
LOG_IF_ERROR(
TRITONBACKEND_MemoryManagerFree(
manager_, buffer_, AllocTypeToMemoryType(alloctype_),
memtype_id_),
"failed to free memory buffer");
break;
}
}
}
TRITONSERVER_MemoryType
BackendMemory::AllocTypeToMemoryType(const AllocationType a)
{
switch (a) {
case AllocationType::CPU:
return TRITONSERVER_MEMORY_CPU;
case AllocationType::CPU_PINNED:
case AllocationType::CPU_PINNED_POOL:
return TRITONSERVER_MEMORY_CPU_PINNED;
case AllocationType::GPU:
case AllocationType::GPU_POOL:
return TRITONSERVER_MEMORY_GPU;
}
return TRITONSERVER_MEMORY_CPU; // unreachable
}
const char*
BackendMemory::AllocTypeString(const AllocationType a)
{
switch (a) {
case AllocationType::CPU:
return "CPU";
case AllocationType::CPU_PINNED:
return "CPU_PINNED";
case AllocationType::GPU:
return "GPU";
case AllocationType::CPU_PINNED_POOL:
return "CPU_PINNED_POOL";
case AllocationType::GPU_POOL:
return "GPU_POOL";
}
return "<unknown>";
}
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_common.h"
namespace triton { namespace backend {
//
// BackendModel
//
BackendModel::BackendModel(
TRITONBACKEND_Model* triton_model, const bool allow_optional)
: triton_model_(triton_model), allow_optional_(allow_optional)
{
const char* model_name;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelName(triton_model, &model_name));
name_ = model_name;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelVersion(triton_model, &version_));
const char* repository_path = nullptr;
TRITONBACKEND_ArtifactType repository_artifact_type;
THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_ModelRepository(
triton_model, &repository_artifact_type, &repository_path));
if (repository_artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) {
throw BackendModelException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
(std::string("unsupported repository artifact type for model '") +
model_name + "'")
.c_str()));
}
repository_path_ = repository_path;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelServer(triton_model, &triton_server_));
TRITONBACKEND_Backend* backend;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelBackend(triton_model, &backend));
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_BackendMemoryManager(backend, &triton_memory_manager_));
THROW_IF_BACKEND_MODEL_ERROR(ParseModelConfig());
}
TRITONSERVER_Error*
BackendModel::ParseModelConfig()
{
TRITONSERVER_Message* config_message;
RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(
triton_model_, 1 /* config_version */, &config_message));
// Get the model configuration as a json string from
// config_message. We use TritonJson, which is a wrapper that
// returns nice errors (currently the underlying implementation is
// rapidjson... but others could be added).
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(
TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));
TRITONSERVER_Error* err = model_config_.Parse(buffer, byte_size);
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));
RETURN_IF_ERROR(err);
int64_t mbs = 0;
RETURN_IF_ERROR(model_config_.MemberAsInt("max_batch_size", &mbs));
max_batch_size_ = mbs;
enable_pinned_input_ = false;
enable_pinned_output_ = false;
{
common::TritonJson::Value optimization;
if (model_config_.Find("optimization", &optimization)) {
common::TritonJson::Value pinned_memory;
if (optimization.Find("input_pinned_memory", &pinned_memory)) {
RETURN_IF_ERROR(
pinned_memory.MemberAsBool("enable", &enable_pinned_input_));
}
if (optimization.Find("output_pinned_memory", &pinned_memory)) {
RETURN_IF_ERROR(
pinned_memory.MemberAsBool("enable", &enable_pinned_output_));
}
}
}
RETURN_IF_ERROR(
BatchInput::ParseFromModelConfig(model_config_, &batch_inputs_));
RETURN_IF_ERROR(
BatchOutput::ParseFromModelConfig(model_config_, &batch_outputs_));
for (const auto& batch_output : batch_outputs_) {
for (const auto& name : batch_output.TargetNames()) {
batch_output_map_.emplace(name, &batch_output);
}
}
triton::common::TritonJson::Value config_inputs;
RETURN_IF_ERROR(model_config_.MemberAsArray("input", &config_inputs));
for (size_t i = 0; i < config_inputs.ArraySize(); i++) {
triton::common::TritonJson::Value io;
RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io));
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
triton::common::TritonJson::Value input_property_json;
bool allow_ragged_batch = false;
if (io.Find("allow_ragged_batch", &input_property_json)) {
RETURN_IF_ERROR(input_property_json.AsBool(&allow_ragged_batch));
}
if (allow_ragged_batch) {
ragged_inputs_.emplace(io_name);
}
bool optional = false;
if (io.Find("optional", &input_property_json)) {
RETURN_IF_ERROR(input_property_json.AsBool(&optional));
}
if (optional) {
if (allow_optional_) {
optional_inputs_.emplace(io_name);
} else {
RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("'optional' is set to true for input '") + io_name +
"' while the backend model doesn't support optional input")
.c_str()));
}
}
}
return nullptr;
}
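// Illustrative example (not part of the original source) of the model
// configuration fields consumed by ParseModelConfig() above, shown in the
// model config text format; field names follow the parsing code, the
// values are assumptions:
//
//   max_batch_size: 8
//   optimization {
//     input_pinned_memory { enable: true }
//     output_pinned_memory { enable: true }
//   }
//   input [
//     { name: "RAGGED_INPUT" allow_ragged_batch: true ... },
//     { name: "OPTIONAL_INPUT" optional: true ... }
//   ]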
TRITONSERVER_Error*
BackendModel::SetModelConfig()
{
triton::common::TritonJson::WriteBuffer json_buffer;
RETURN_IF_ERROR(ModelConfig().Write(&json_buffer));
TRITONSERVER_Message* message;
RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(
&message, json_buffer.Base(), json_buffer.Size()));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(
triton_model_, 1 /* config_version */, message));
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(message));
// Triton core can normalize the missing config settings
// in the above call. We must retrieve the updated model
// configuration from the core.
RETURN_IF_ERROR(ParseModelConfig());
return nullptr;
}
TRITONSERVER_Error*
BackendModel::SupportsFirstDimBatching(bool* supports)
{
*supports = max_batch_size_ > 0;
return nullptr;
}
const BatchOutput*
BackendModel::FindBatchOutput(const std::string& output_name) const
{
const auto it = batch_output_map_.find(output_name);
return ((it == batch_output_map_.end()) ? nullptr : it->second);
}
}} // namespace triton::backend
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_model_instance.h"
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_model.h"
namespace triton { namespace backend {
//
// BackendModelInstance
//
BackendModelInstance::BackendModelInstance(
BackendModel* backend_model,
TRITONBACKEND_ModelInstance* triton_model_instance)
: backend_model_(backend_model),
triton_model_instance_(triton_model_instance)
{
const char* instance_name;
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));
name_ = instance_name;
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceKind(triton_model_instance, &kind_));
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &device_id_));
common::TritonJson::Value& model_config = backend_model->ModelConfig();
// If the model configuration specifies a 'default_model_filename'
// and/or specifies 'cc_model_filenames' then determine the
// appropriate 'artifact_filename' value. If model configuration
// does not specify then just leave 'artifact_filename' empty and
// the backend can then provide its own logic for determining the
// filename if that is appropriate.
THROW_IF_BACKEND_INSTANCE_ERROR(model_config.MemberAsString(
"default_model_filename", &artifact_filename_));
switch (kind_) {
case TRITONSERVER_INSTANCEGROUPKIND_CPU: {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ +
" on CPU using artifact '" + artifact_filename_ + "'")
.c_str());
break;
}
case TRITONSERVER_INSTANCEGROUPKIND_MODEL: {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ +
" on model-specified devices using artifact '" + artifact_filename_ +
"'")
.c_str());
break;
}
case TRITONSERVER_INSTANCEGROUPKIND_GPU: {
#if defined(TRITON_ENABLE_GPU)
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, device_id_);
if (cuerr != cudaSuccess) {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to get CUDA device properties for ") + name_ +
": " + cudaGetErrorString(cuerr))
.c_str()));
}
const std::string cc =
std::to_string(cuprops.major) + "." + std::to_string(cuprops.minor);
common::TritonJson::Value cc_names;
common::TritonJson::Value cc_name;
if ((model_config.Find("cc_model_filenames", &cc_names)) &&
(cc_names.Find(cc.c_str(), &cc_name))) {
cc_name.AsString(&artifact_filename_);
}
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ + " on GPU " +
std::to_string(device_id_) + " (" + cc + ") using artifact '" +
artifact_filename_ + "'")
.c_str());
#elif !defined(TRITON_ENABLE_MALI_GPU)
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "GPU instances not supported"));
#endif // TRITON_ENABLE_GPU
break;
}
default: {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unexpected instance kind for ") + name_).c_str()));
}
}
stream_ = nullptr;
if (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
THROW_IF_BACKEND_INSTANCE_ERROR(
CreateCudaStream(device_id_, 0 /* cuda_stream_priority */, &stream_));
}
// Get the host policy setting as a json string from message,
// and extract the host policy name for the instance.
TRITONSERVER_Message* message = nullptr;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelInstanceHostPolicy(triton_model_instance_, &message));
const char* buffer;
size_t byte_size;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_MessageSerializeToJson(message, &buffer, &byte_size));
common::TritonJson::Value host_policy;
TRITONSERVER_Error* err = host_policy.Parse(buffer, byte_size);
THROW_IF_BACKEND_MODEL_ERROR(err);
std::vector<std::string> host_policy_name;
THROW_IF_BACKEND_MODEL_ERROR(host_policy.Members(&host_policy_name));
if (host_policy_name.size() != 1) {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unexpected no host policy for ") + name_).c_str()));
}
host_policy_name_ = host_policy_name[0];
}
BackendModelInstance::~BackendModelInstance()
{
#ifdef TRITON_ENABLE_GPU
if (stream_ != nullptr) {
cudaError_t err = cudaStreamDestroy(stream_);
if (err != cudaSuccess) {
TRITONSERVER_LogMessage(
TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,
(std::string("~BackendModelInstance: ") + name_ +
" failed to destroy cuda stream: " + cudaGetErrorString(err))
.c_str());
}
stream_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_output_responder.h"
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
namespace triton { namespace backend {
//
// BackendOutputResponder
//
BackendOutputResponder::~BackendOutputResponder()
{
for (auto& pinned_memory : pinned_memories_) {
LOG_IF_ERROR(
TRITONBACKEND_MemoryManagerFree(
memory_manager_, reinterpret_cast<void*>(pinned_memory),
TRITONSERVER_MEMORY_CPU_PINNED, 0),
"failed to free pinned memory");
}
}
void
BackendOutputResponder::ProcessTensor(
const std::string& output_name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
const int64_t batchn_batch_size = batchn_shape[0];
int64_t batch_size_offset = 0;
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response.
if (first_dim_batching_) {
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInputByIndex(request, 0, &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
if ((batchn_batch_size != -1) &&
((batch_size_offset + shape[0]) > batchn_batch_size)) {
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
std::string(
GetRequestId(request) +
"failed to split the output tensor '" + output_name +
"' in responses: expected batch size of atleast " +
std::to_string(batch_size_offset + shape[0]) +
" in model output, got " +
std::to_string(batchn_batch_size))
.c_str()));
}
}
batchn_shape[0] = shape[0];
batch_size_offset += shape[0];
}
const size_t tensor_byte_size = GetByteSize(datatype, batchn_shape);
TRITONBACKEND_Output* response_output;
if (response != nullptr) {
uint32_t output_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestOutputCount(request, &output_count));
if (response != nullptr) {
for (uint32_t output_idx = 0; output_idx < output_count; output_idx++) {
const char* name;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputName(request, output_idx, &name));
if ((response != nullptr) && (output_name == name)) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_ResponseOutput(
response, &response_output, name, datatype,
batchn_shape.data(), batchn_shape.size()));
if (response != nullptr) {
need_sync_ |= SetFixedSizeBuffer(
&response, response_output, output_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, false /* state */);
}
break;
}
}
}
}
tensor_offset += tensor_byte_size;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
std::vector<TRITONBACKEND_State*>
BackendOutputResponder::ProcessStateTensor(
const std::string& output_state_name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
std::vector<TRITONBACKEND_State*> states;
const int64_t batchn_batch_size = batchn_shape[0];
int64_t batch_size_offset = 0;
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response.
if (first_dim_batching_) {
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInputByIndex(request, 0, &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
if ((batchn_batch_size != -1) &&
((batch_size_offset + shape[0]) > batchn_batch_size)) {
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
std::string(
GetRequestId(request) +
"failed to split the output state tensor '" +
output_state_name +
"' in responses: expected batch size of atleast " +
std::to_string(batch_size_offset + shape[0]) +
" in model output, got " +
std::to_string(batchn_batch_size))
.c_str()));
}
}
batchn_shape[0] = shape[0];
batch_size_offset += shape[0];
}
const size_t tensor_byte_size = GetByteSize(datatype, batchn_shape);
TRITONBACKEND_State* output_state;
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_StateNew(
&output_state, request, output_state_name.c_str(),
datatype, batchn_shape.data(), batchn_shape.size()));
if (response != nullptr) {
states.push_back(output_state);
need_sync_ |= SetFixedSizeBuffer(
&response, output_state, output_state_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, true /* state */);
}
}
tensor_offset += tensor_byte_size;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
return states;
}
bool
BackendOutputResponder::Finalize()
{
#ifdef TRITON_ENABLE_GPU
if ((!deferred_pinned_.empty()) && need_sync_) {
if (event_ != nullptr) {
cudaEventSynchronize(event_);
} else {
cudaStreamSynchronize(stream_);
}
need_sync_ = false;
}
#endif // TRITON_ENABLE_GPU
// After the above sync all the GPU->pinned copies are complete. Any
// deferred copies of pinned->CPU can now be done.
for (auto& def : deferred_pinned_) {
auto pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
int64_t pinned_memory_id = 0;
char* pinned_buffer = def.pinned_memory_;
size_t offset = 0;
for (auto& pr : def.responses_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, pinned_memory_type, pinned_memory_id,
response_output.memory_type_, response_output.memory_type_id_,
response_output.buffer_byte_size_, pinned_buffer + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
need_sync_ |= cuda_used;
offset += response_output.buffer_byte_size_;
}
}
#ifdef TRITON_ENABLE_GPU
// Record the new event location if deferred copies occur
if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
deferred_pinned_.clear();
return need_sync_;
}
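// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the library): a backend that has
// produced one contiguous output buffer for the whole batch can hand the
// data back to the individual responses with ProcessTensor() and then
// Finalize(). 'responder', 'output_buffer', 'total_batch_size' and 'stream'
// are illustrative names; the responder is assumed to have been constructed
// with this execution's requests, responses, memory manager and CUDA stream.
//
//   std::vector<int64_t> batchn_shape = {total_batch_size, 16 /* example */};
//   responder.ProcessTensor(
//       "OUTPUT0", TRITONSERVER_TYPE_FP32, batchn_shape, output_buffer,
//       TRITONSERVER_MEMORY_GPU, 0 /* memory_type_id */);
//
//   // Finalize() performs any deferred pinned->CPU copies and reports
//   // whether asynchronous CUDA copies are still outstanding on the stream.
//   const bool need_cuda_sync = responder.Finalize();
//   #ifdef TRITON_ENABLE_GPU
//   if (need_cuda_sync) {
//     cudaStreamSynchronize(stream);
//   }
//   #endif  // TRITON_ENABLE_GPU
// ---------------------------------------------------------------------------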
bool
BackendOutputResponder::SetFixedSizeBuffer(
TRITONBACKEND_Response** response, void* response_output_or_state,
const std::string& output_name, const size_t tensor_byte_size,
const size_t tensor_offset, const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id,
const TRITONSERVER_MemoryType use_pinned_memory_type, bool state)
{
void* buffer = nullptr;
bool cuda_copy = false;
TRITONSERVER_MemoryType actual_memory_type = tensor_memory_type;
int64_t actual_memory_type_id = tensor_memory_type_id;
if (state) {
TRITONBACKEND_State* response_state =
reinterpret_cast<TRITONBACKEND_State*>(response_output_or_state);
auto err = TRITONBACKEND_StateBuffer(
response_state, &buffer, tensor_byte_size, &actual_memory_type,
&actual_memory_type_id);
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
} else {
TRITONBACKEND_Output* response_output =
reinterpret_cast<TRITONBACKEND_Output*>(response_output_or_state);
auto err = TRITONBACKEND_OutputBuffer(
response_output, &buffer, tensor_byte_size, &actual_memory_type,
&actual_memory_type_id);
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
}
// If the response buffer matches the memory type that should use an
// intermediate pinned memory buffer for the transfer, then just
// record the response as pending and increase the size required for
// the intermediate pinned buffer.
if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
(actual_memory_type == use_pinned_memory_type)) {
if (pending_pinned_byte_size_ == 0) {
pending_pinned_offset_ = tensor_offset;
}
pending_pinned_byte_size_ += tensor_byte_size;
pending_pinned_outputs_.push_back(std::make_pair(
response, OutputData(
output_name, buffer, tensor_byte_size, actual_memory_type,
actual_memory_type_id)));
} else {
// Direct copy without intermediate pinned memory.
bool cuda_used = false;
auto err = CopyBuffer(
output_name, tensor_memory_type, tensor_memory_type_id,
actual_memory_type, actual_memory_type_id, tensor_byte_size,
tensor_buffer + tensor_offset, buffer, stream_, &cuda_used,
copy_on_stream_);
cuda_copy |= cuda_used;
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
}
return cuda_copy;
}
bool
BackendOutputResponder::FlushPendingPinned(
const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
bool cuda_copy = false;
// Will be copying from CPU->pinned->GPU or GPU->pinned->CPU
// Attempt to allocate a pinned buffer to use for staging the
// copy... if we fail to allocate the pinned buffer then we just
// directly go CPU->GPU or GPU->CPU.
char* pinned_memory = nullptr;
if (pending_pinned_byte_size_ > 0) {
TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(
memory_manager_, reinterpret_cast<void**>(&pinned_memory),
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,
pending_pinned_byte_size_);
if (err != nullptr) {
pinned_memory = nullptr;
TRITONSERVER_ErrorDelete(err);
}
}
// If the pinned buffer wasn't actually allocated then just perform
// a direct copy.
if (pinned_memory == nullptr) {
size_t offset = 0;
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, tensor_memory_type, tensor_memory_type_id,
response_output.memory_type_, response_output.memory_type_id_,
response_output.buffer_byte_size_,
tensor_buffer + pending_pinned_offset_ + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
cuda_copy |= cuda_used;
offset += response_output.buffer_byte_size_;
}
}
// We have a pinned buffer so do a single copy of a block of tensor
// data to the pinned buffer.
else {  // pinned_memory was successfully allocated
bool cuda_used = false;
auto err = CopyBuffer(
"pinned buffer", tensor_memory_type, tensor_memory_type_id,
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,
pending_pinned_byte_size_, tensor_buffer + pending_pinned_offset_,
pinned_memory, stream_, &cuda_used, copy_on_stream_);
cuda_copy |= cuda_used;
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
if (*response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
*response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send TensorFlow error response");
*response = nullptr;
}
}
TRITONSERVER_ErrorDelete(err);
}
// If the copy was not async (i.e. if tensor was in CPU so a
// CPU->CPU-PINNED copy was performed above), then the pinned
// buffer now holds the tensor contents and we can immediately
// issue the copies from the pinned buffer to the
// responses.
//
// Otherwise the GPU->CPU-PINNED async copies are in flight and we
// simply remember the pinned buffer and the corresponding
// response outputs so that we can do the pinned->CPU copies in
// finalize after we have waited for all async copies to complete.
if (!cuda_used) {
size_t offset = 0;
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, TRITONSERVER_MEMORY_CPU_PINNED,
0 /* memory_type_id */, response_output.memory_type_,
response_output.memory_type_id_,
response_output.buffer_byte_size_, pinned_memory + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
cuda_copy |= cuda_used;
offset += response_output.buffer_byte_size_;
}
} else {
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_,
std::move(pending_pinned_outputs_));
}
}
// Pending pinned copies are handled...
pending_pinned_byte_size_ = 0;
pending_pinned_offset_ = 0;
pending_pinned_outputs_.clear();
// Need to hold on to the allocated pinned buffer as there are still
// copies in flight. Will delete it in finalize.
if (pinned_memory != nullptr) {
pinned_memories_.push_back(pinned_memory);
}
return cuda_copy;
}
void
BackendOutputResponder::ProcessBatchOutput(
const std::string& name, const BatchOutput& batch_output,
const char* buffer, const TRITONSERVER_MemoryType memory_type,
const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
// Batch output may be processed differently based on the kind
switch (batch_output.BatchOutputKind()) {
case BatchOutput::Kind::BATCH_SCATTER_WITH_INPUT_SHAPE: {
const auto& output_name = batch_output.TargetNames()[0];
const auto& input_name = batch_output.SourceInputs()[0];
const auto& datatype = batch_output.DataType();
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response, under the naive
// assumption that a dynamic dimension in the output maps to the same
// dimension in the input.
auto output_batchn_shape = batch_output.OutputShape();
{
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInput(request, input_name.c_str(), &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
for (size_t dim_idx = 0; dim_idx < output_batchn_shape.size();
dim_idx++) {
if (output_batchn_shape[dim_idx] == -1) {
output_batchn_shape[dim_idx] = shape[dim_idx];
}
}
}
const size_t tensor_byte_size =
GetByteSize(datatype, output_batchn_shape);
TRITONBACKEND_Output* response_output;
if (response != nullptr) {
uint32_t output_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputCount(request, &output_count));
if (response != nullptr) {
for (uint32_t output_idx = 0; output_idx < output_count;
output_idx++) {
const char* name;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputName(request, output_idx, &name));
if ((response != nullptr) && (output_name == name)) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_ResponseOutput(
response, &response_output, name, datatype,
output_batchn_shape.data(),
output_batchn_shape.size()));
if (response != nullptr) {
need_sync_ |= SetFixedSizeBuffer(
&response, response_output, output_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, false /* state */);
}
break;
}
}
}
}
tensor_offset += tensor_byte_size;
}
break;
}
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "kernel.h"
#include <cuda.h>
#define THREADBLOCK_SIZE 512
__launch_bounds__(THREADBLOCK_SIZE) __global__ void TritonGatherKernel(
const int8_t** __restrict input_ptr_buffer,
const size_t* __restrict byte_size_buffer,
const size_t* __restrict byte_size_offset_buffer,
int8_t* __restrict output_buffer)
{
int request_idx = blockIdx.x;
int lane_id = threadIdx.x;
const int8_t* request_input_buffer = input_ptr_buffer[request_idx];
int byte_size = byte_size_buffer[request_idx];
int byte_size_offset = byte_size_offset_buffer[request_idx];
int8_t* output_buffer_with_offset = output_buffer + byte_size_offset;
if (((byte_size % 4) == 0) && (((uint64_t)request_input_buffer % 4) == 0) &&
(((uint64_t)output_buffer_with_offset % 4) == 0)) {
int32_t* input_4 = (int32_t*)request_input_buffer;
int32_t* output_4 = (int32_t*)output_buffer_with_offset;
int element_count = byte_size / 4;
for (int elem_id = lane_id; elem_id < element_count;
elem_id += THREADBLOCK_SIZE) {
output_4[elem_id] = input_4[elem_id];
}
} else {
for (int elem_id = lane_id; elem_id < byte_size;
elem_id += THREADBLOCK_SIZE) {
output_buffer_with_offset[elem_id] =
__ldg(request_input_buffer + elem_id);
}
}
}
#ifdef __cplusplus
extern "C" {
#endif
cudaError_t
RunGatherKernel(
const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,
const size_t* byte_size_offset_buffer, int8_t* output_buffer,
size_t request_count, cudaStream_t stream)
{
TritonGatherKernel<<<request_count, THREADBLOCK_SIZE, 0, stream>>>(
input_ptr_buffer, byte_size_buffer, byte_size_offset_buffer,
output_buffer);
return cudaGetLastError();
}
#ifdef __cplusplus
}
#endif
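// ---------------------------------------------------------------------------
// Host-side usage sketch (hypothetical, for illustration only). The kernel
// launches one thread block per request and copies each request's buffer to
// its offset in a single contiguous output buffer, using 4-byte loads when
// the size and both pointers are 4-byte aligned. Note that the pointer, size
// and offset arrays passed to RunGatherKernel must themselves be accessible
// from the device. 'd_input_for_request' and 'input_byte_size_for_request'
// are illustrative helpers, not part of this code.
//
//   std::vector<const int8_t*> h_ptrs(request_count);
//   std::vector<size_t> h_sizes(request_count), h_offsets(request_count);
//   size_t total_byte_size = 0;
//   for (size_t r = 0; r < request_count; ++r) {
//     h_ptrs[r] = d_input_for_request(r);
//     h_sizes[r] = input_byte_size_for_request(r);
//     h_offsets[r] = total_byte_size;
//     total_byte_size += h_sizes[r];
//   }
//
//   const int8_t** d_ptrs; size_t* d_sizes; size_t* d_offsets;
//   int8_t* d_output;
//   cudaMalloc(&d_ptrs, request_count * sizeof(int8_t*));
//   cudaMalloc(&d_sizes, request_count * sizeof(size_t));
//   cudaMalloc(&d_offsets, request_count * sizeof(size_t));
//   cudaMalloc(&d_output, total_byte_size);
//   cudaMemcpyAsync(
//       d_ptrs, h_ptrs.data(), request_count * sizeof(int8_t*),
//       cudaMemcpyHostToDevice, stream);
//   cudaMemcpyAsync(
//       d_sizes, h_sizes.data(), request_count * sizeof(size_t),
//       cudaMemcpyHostToDevice, stream);
//   cudaMemcpyAsync(
//       d_offsets, h_offsets.data(), request_count * sizeof(size_t),
//       cudaMemcpyHostToDevice, stream);
//
//   cudaError_t err = RunGatherKernel(
//       d_ptrs, d_sizes, d_offsets, d_output, request_count, stream);
//   // On failure a backend would typically fall back to per-request
//   // cudaMemcpyAsync copies.
// ---------------------------------------------------------------------------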
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cuda_runtime_api.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
cudaError_t RunGatherKernel(
const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,
const size_t* byte_size_offset_buffer, int8_t* output_buffer,
size_t request_count, cudaStream_t stream);
#ifdef __cplusplus
}
#endif
---
BasedOnStyle: Google
IndentWidth: 2
ContinuationIndentWidth: 4
UseTab: Never
MaxEmptyLinesToKeep: 2
SortIncludes: true
CompactNamespaces: true
ReflowComments: true
DerivePointerAlignment: false
PointerAlignment: Left
AllowShortIfStatementsOnASingleLine: false
AllowShortBlocksOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlignAfterOpenBracket: AlwaysBreak
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: false
AfterStruct: false
AfterUnion: false
BeforeCatch: true
BinPackArguments: true
BinPackParameters: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
IndentCaseLabels: true
# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.16)
project(tritoncommon LANGUAGES C CXX)
#
# Options
#
# Some components are expensive to build and have extensive
# dependencies, so those parts of the build must be enabled
# explicitly.
option(TRITON_COMMON_ENABLE_PROTOBUF "Build protobuf artifacts" OFF)
option(TRITON_COMMON_ENABLE_PROTOBUF_PYTHON "Build protobuf artifacts for python" ON)
option(TRITON_COMMON_ENABLE_GRPC "Build grpc artifacts" OFF)
option(TRITON_COMMON_ENABLE_JSON "Build json-related libs" ON)
#option(TRITON_COMMON_ENABLE_JSON "Build json-related libs" OFF)
if(TRITON_COMMON_ENABLE_JSON)
find_package(RapidJSON CONFIG REQUIRED)
message(STATUS "RapidJSON found. Headers: ${RAPIDJSON_INCLUDE_DIRS}")
endif()
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
message("Using MSVC as compiler, default target on Windows 10. "
"If the target system is not Windows 10, please update _WIN32_WINNT "
"to corresponding value.")
endif()
add_library(common-compile-settings INTERFACE)
target_compile_features(common-compile-settings INTERFACE cxx_std_11)
target_compile_options(common-compile-settings INTERFACE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
$<$<CXX_COMPILER_ID:MSVC>:/W0 /D_WIN32_WINNT=0x0A00 /EHsc>
)
#
# Error
#
add_library(
triton-common-error
src/error.cc
)
add_library(
TritonCommon::triton-common-error ALIAS triton-common-error
)
target_include_directories(
triton-common-error
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-error PRIVATE common-compile-settings)
#
# Logging
#
add_library(
triton-common-logging
src/logging.cc
)
add_library(
TritonCommon::triton-common-logging ALIAS triton-common-logging
)
target_include_directories(
triton-common-logging
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
if(${TRITON_ENABLE_LOGGING})
target_compile_definitions(
triton-common-logging
PRIVATE TRITON_ENABLE_LOGGING=1
)
endif() # TRITON_ENABLE_LOGGING
target_link_libraries(triton-common-logging PRIVATE common-compile-settings)
#
# SyncQueue
#
add_library(
triton-common-sync-queue INTERFACE
)
add_library(
TritonCommon::triton-common-sync-queue ALIAS triton-common-sync-queue
)
target_include_directories(
triton-common-sync-queue
INTERFACE
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
#
# Async Work Queue
#
add_library(
triton-common-async-work-queue
src/async_work_queue.cc
src/error.cc
src/thread_pool.cc
)
add_library(
TritonCommon::triton-common-async-work-queue ALIAS triton-common-async-work-queue
)
target_include_directories(
triton-common-async-work-queue
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-async-work-queue
PUBLIC
Threads::Threads
PRIVATE
common-compile-settings
)
#
# Thread Pool
#
add_library(
triton-common-thread-pool
src/thread_pool.cc
)
add_library(
TritonCommon::triton-common-thread-pool ALIAS triton-common-thread-pool
)
target_include_directories(
triton-common-thread-pool
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-thread-pool
PUBLIC
Threads::Threads
PRIVATE
common-compile-settings
)
#
# JSON utilities
#
if(TRITON_COMMON_ENABLE_JSON)
add_library(
triton-common-json INTERFACE
)
add_library(
TritonCommon::triton-common-json ALIAS triton-common-json
)
target_include_directories(
triton-common-json
INTERFACE
$<INSTALL_INTERFACE:include>
$<INSTALL_INTERFACE:${RAPIDJSON_INCLUDE_DIRS}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${RAPIDJSON_INCLUDE_DIRS}>
)
endif()
#
# Table Printer
#
add_library(
triton-common-table-printer
src/table_printer.cc
)
add_library(
TritonCommon::triton-common-table-printer ALIAS triton-common-table-printer
)
target_include_directories(
triton-common-table-printer
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-table-printer PRIVATE common-compile-settings)
set_target_properties(
triton-common-async-work-queue
triton-common-error
triton-common-logging
triton-common-table-printer
triton-common-thread-pool
PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS TRUE
POSITION_INDEPENDENT_CODE ON
)
set_target_properties(
triton-common-async-work-queue
PROPERTIES
OUTPUT_NAME tritonasyncworkqueue
)
set_target_properties(
triton-common-thread-pool
PROPERTIES
OUTPUT_NAME tritonthreadpool
)
set_target_properties(
triton-common-error
PROPERTIES
OUTPUT_NAME tritoncommonerror
)
set_target_properties(
triton-common-logging
PROPERTIES
OUTPUT_NAME tritoncommonlogging
)
set_target_properties(
triton-common-table-printer
PROPERTIES
OUTPUT_NAME tritontableprinter
)
#
# Protobuf and GRPC artifacts
#
if(${TRITON_COMMON_ENABLE_PROTOBUF} OR ${TRITON_COMMON_ENABLE_GRPC})
add_subdirectory(protobuf)
set(protobuf_MODULE_COMPATIBLE TRUE CACHE BOOL "protobuf_MODULE_COMPATIBLE" FORCE)
find_package(Protobuf CONFIG REQUIRED)
message(STATUS "Using protobuf ${Protobuf_VERSION}")
#
# Model Config (depends on protobuf & generated .pb.h file)
#
add_library(
triton-common-model-config
src/model_config.cc
)
add_library(
TritonCommon::triton-common-model-config ALIAS triton-common-model-config
)
target_include_directories(
triton-common-model-config
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${Protobuf_INCLUDE_DIRS}
)
target_link_libraries(
triton-common-model-config
PRIVATE
common-compile-settings
protobuf::libprotobuf
proto-library
)
set_target_properties(
triton-common-model-config
PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS TRUE
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME tritoncommonmodelconfig
)
endif()
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonCommon)
install(
TARGETS
triton-common-async-work-queue
triton-common-error
triton-common-logging
triton-common-sync-queue
triton-common-table-printer
triton-common-thread-pool
common-compile-settings
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(TRITON_COMMON_ENABLE_JSON)
install(
TARGETS
triton-common-json
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(${TRITON_COMMON_ENABLE_GRPC} OR ${TRITON_COMMON_ENABLE_PROTOBUF})
install(
TARGETS
proto-library
triton-common-model-config
# proto-py-library
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(${TRITON_COMMON_ENABLE_GRPC})
install(
TARGETS
grpc-service-library
# grpc-service-py-library
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
install(
DIRECTORY include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(
EXPORT
triton-common-targets
FILE
TritonCommonTargets.cmake
NAMESPACE
TritonCommon::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonCommonConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonConfig.cmake
DESTINATION
${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT
triton-common-targets
FILE
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonTargets.cmake
NAMESPACE
TritonCommon::
)
export(PACKAGE TritonCommon)
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
<!--
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
# Triton Inference Server Common
Common source, scripts and utilities shared across all Triton
repositories.
This repo is not typically built directly but is instead included in
the build of other repos. To build directly, first install the required
dependencies.
```
$ apt-get install rapidjson-dev
```
Use cmake 3.17 or later to build and install in a local directory.
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@PACKAGE_INIT@
set_and_check(TRITONCOMMON_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}")
list(APPEND CMAKE_MODULE_PATH ${TRITONCOMMON_CMAKE_DIR})
include(CMakeFindDependencyMacro)
find_dependency(Threads)
if(NOT TARGET TritonCommon::triton-common-json)
include("${TRITONCOMMON_CMAKE_DIR}/TritonCommonTargets.cmake")
endif()
check_required_components(triton-common-json
triton-common-sync-queue
triton-common-async-work-queue
triton-common-thread-pool
)
set(TRITONCOMMON_LIBRARIES
TritonCommon::triton-common-json
TritonCommon::triton-common-sync-queue
TritonCommon::triton-common-async-work-queue
TritonCommon::triton-common-thread-pool
)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "error.h"
#include "thread_pool.h"
namespace triton { namespace common {
// Manager for asynchronous worker threads. Use to accelerate copies and
// other such operations by running them in parallel.
// Call Initialize to start the worker threads (once) and AddTask to add
// tasks to the queue.
class AsyncWorkQueue {
public:
// Start 'worker_count' number of worker threads.
static Error Initialize(size_t worker_count);
// Get the number of worker threads.
static size_t WorkerCount();
// Add a 'task' to the queue. The function will take ownership of 'task'.
// Therefore std::move should be used when calling AddTask.
static Error AddTask(std::function<void(void)>&& task);
protected:
static void Reset();
private:
AsyncWorkQueue() = default;
~AsyncWorkQueue();
static AsyncWorkQueue* GetSingleton();
std::unique_ptr<ThreadPool> thread_pool_;
};
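// Usage sketch (hypothetical): initialize the pool once at startup and then
// move tasks into the queue; both calls return an Error that should be
// checked.
//
//   auto err = AsyncWorkQueue::Initialize(4 /* worker_count */);
//   if (!err.IsOk()) { /* report err.Message() */ }
//
//   std::function<void(void)> task = []() { /* e.g. a buffer copy */ };
//   err = AsyncWorkQueue::AddTask(std::move(task));
//   if (!err.IsOk()) { /* report err.Message() */ }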
}} // namespace triton::common
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
namespace triton { namespace common {
//
// Error
//
// Error returned by utilities from common repo.
//
class Error {
public:
enum class Code {
SUCCESS,
UNKNOWN,
INTERNAL,
NOT_FOUND,
INVALID_ARG,
UNAVAILABLE,
UNSUPPORTED,
ALREADY_EXISTS
};
explicit Error(Code code = Code::SUCCESS) : code_(code) {}
explicit Error(Code code, const std::string& msg) : code_(code), msg_(msg) {}
// Convenience "success" value. Can be used as Error::Success to
// indicate no error.
static const Error Success;
// Return the code for this status.
Code ErrorCode() const { return code_; }
// Return the message for this status.
const std::string& Message() const { return msg_; }
// Return true if this status indicates "ok"/"success", false if
// status indicates some kind of failure.
bool IsOk() const { return code_ == Code::SUCCESS; }
// Return the status as a string.
std::string AsString() const;
// Return the constant string name for a code.
static const char* CodeString(const Code code);
protected:
Code code_;
std::string msg_;
};
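// Usage sketch (hypothetical): utilities in this repo return Error by value;
// callers check IsOk() and log or propagate AsString() on failure.
//
//   Error ExampleLoad(const std::string& path)   // illustrative function
//   {
//     if (path.empty()) {
//       return Error(Error::Code::INVALID_ARG, "path must not be empty");
//     }
//     return Error::Success;
//   }
//
//   const Error err = ExampleLoad("");
//   if (!err.IsOk()) {
//     std::cerr << err.AsString() << std::endl;
//   }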
}} // namespace triton::common
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <mutex>
#include <sstream>
#include <string>
#include <vector>
#include <cerrno>
#include <cstring>
#include <fstream>
namespace triton { namespace common {
// A log message.
class LogMessage {
public:
// Log levels.
enum Level { kERROR = 0, kWARNING = 1, kINFO = 2 };
LogMessage(const char* file, int line, uint32_t level);
~LogMessage();
std::stringstream& stream() { return stream_; }
private:
static const std::vector<char> level_name_;
std::stringstream stream_;
};
// Global logger for messages. Controls how log messages are reported.
class Logger {
public:
enum class Format { kDEFAULT, kISO8601 };
Logger();
// Is a log level enabled.
bool IsEnabled(LogMessage::Level level) const { return enables_[level]; }
// Set enable for a log Level.
void SetEnabled(LogMessage::Level level, bool enable)
{
enables_[level] = enable;
}
// Get the current verbose logging level.
uint32_t VerboseLevel() const { return vlevel_; }
// Set the current verbose logging level.
void SetVerboseLevel(uint32_t vlevel) { vlevel_ = vlevel; }
// Get the logging format.
Format LogFormat() { return format_; }
// Get the logging format as a string.
std::string LogFormatString()
{
switch (format_) {
case Format::kISO8601:
return "ISO8601";
case Format::kDEFAULT:
return "default";
default:
return "Invalid format";
}
}
// Set the logging format.
void SetLogFormat(Format format) { format_ = format; }
// Get the log output file name.
const std::string& LogFile() { return filename_; }
// Set the log output file. Returns an empty string upon
// success, else returns an error string.
const std::string SetLogFile(const std::string& filename)
{
const std::lock_guard<std::mutex> lock(mutex_);
file_stream_.close();
std::string revert_name(filename_);
filename_ = filename;
if (!filename_.empty()) {
file_stream_.open(filename_, std::ios::app);
if (file_stream_.fail()) {
std::stringstream error;
error << __FILE__ << " " << __LINE__
<< ": Failed to open log file: " << std::strerror(errno)
<< std::endl;
filename_ = revert_name;
file_stream_.open(filename_, std::ios::app);
return error.str();
}
}
// will return an empty string
return std::string();
}
// Log a message.
void Log(const std::string& msg);
// Flush the log.
void Flush();
private:
std::vector<bool> enables_;
uint32_t vlevel_;
Format format_;
std::mutex mutex_;
std::string filename_;
std::ofstream file_stream_;
};
extern Logger gLogger_;
#define LOG_ENABLE_INFO(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kINFO, (E))
#define LOG_ENABLE_WARNING(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kWARNING, (E))
#define LOG_ENABLE_ERROR(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kERROR, (E))
#define LOG_SET_VERBOSE(L) \
triton::common::gLogger_.SetVerboseLevel( \
static_cast<uint32_t>(std::max(0, (L))))
#define LOG_SET_OUT_FILE(FN) triton::common::gLogger_.SetLogFile((FN))
#define LOG_SET_FORMAT(F) triton::common::gLogger_.SetLogFormat((F))
#define LOG_VERBOSE_LEVEL triton::common::gLogger_.VerboseLevel()
#define LOG_FORMAT triton::common::gLogger_.LogFormat()
#define LOG_FORMAT_STRING triton::common::gLogger_.LogFormatString()
#define LOG_FILE triton::common::gLogger_.LogFile()
#ifdef TRITON_ENABLE_LOGGING
#define LOG_INFO_IS_ON \
triton::common::gLogger_.IsEnabled(triton::common::LogMessage::Level::kINFO)
#define LOG_WARNING_IS_ON \
triton::common::gLogger_.IsEnabled( \
triton::common::LogMessage::Level::kWARNING)
#define LOG_ERROR_IS_ON \
triton::common::gLogger_.IsEnabled(triton::common::LogMessage::Level::kERROR)
#define LOG_VERBOSE_IS_ON(L) (triton::common::gLogger_.VerboseLevel() >= (L))
#else
// If logging is disabled, define the macros to be false to avoid further evaluation
#define LOG_INFO_IS_ON false
#define LOG_WARNING_IS_ON false
#define LOG_ERROR_IS_ON false
#define LOG_VERBOSE_IS_ON(L) false
#endif // TRITON_ENABLE_LOGGING
// Macros that use explicitly given filename and line number.
#define LOG_INFO_FL(FN, LN) \
if (LOG_INFO_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kINFO) \
.stream()
#define LOG_WARNING_FL(FN, LN) \
if (LOG_WARNING_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kWARNING) \
.stream()
#define LOG_ERROR_FL(FN, LN) \
if (LOG_ERROR_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kERROR) \
.stream()
#define LOG_VERBOSE_FL(L, FN, LN) \
if (LOG_VERBOSE_IS_ON(L)) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kINFO) \
.stream()
// Macros that use current filename and line number.
#define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__)
#define LOG_WARNING LOG_WARNING_FL(__FILE__, __LINE__)
#define LOG_ERROR LOG_ERROR_FL(__FILE__, __LINE__)
#define LOG_VERBOSE(L) LOG_VERBOSE_FL(L, __FILE__, __LINE__)
#define LOG_STATUS_ERROR(X, MSG) \
do { \
const Status& status__ = (X); \
if (!status__.IsOk()) { \
LOG_ERROR << (MSG) << ": " << status__.AsString(); \
} \
} while (false)
#define LOG_TRITONSERVER_ERROR(X, MSG) \
do { \
TRITONSERVER_Error* err__ = (X); \
if (err__ != nullptr) { \
LOG_ERROR << (MSG) << ": " << TRITONSERVER_ErrorCodeString(err__) \
<< " - " << TRITONSERVER_ErrorMessage(err__); \
TRITONSERVER_ErrorDelete(err__); \
} \
} while (false)
#define LOG_FLUSH triton::common::gLogger_.Flush()
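// Usage sketch (hypothetical): with TRITON_ENABLE_LOGGING defined, enable the
// desired levels and stream messages through the macros; when logging is
// disabled the *_IS_ON guards are false and the message expressions are
// never evaluated.
//
//   LOG_ENABLE_INFO(true);
//   LOG_ENABLE_WARNING(true);
//   LOG_SET_VERBOSE(1);
//
//   LOG_INFO << "model loaded";
//   LOG_WARNING << "falling back to CPU";
//   LOG_VERBOSE(1) << "batch size: " << 8;
//   LOG_FLUSH;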
}} // namespace triton::common
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <google/protobuf/any.pb.h>
#include <stdint.h>
#include "model_config.pb.h"
namespace triton { namespace common {
/// The type for a repeated dims field (used for shape).
using DimsList = ::google::protobuf::RepeatedField<::google::protobuf::int64>;
/// The type for the metric_tags map.
using MetricTagsMap = ::google::protobuf::Map<std::string, std::string>;
// Map from a host policy name to <setting, value> map of cmdline
// settings for the host policy.
using HostPolicyCmdlineConfig = std::map<std::string, std::string>;
using HostPolicyCmdlineConfigMap =
std::unordered_map<std::string, HostPolicyCmdlineConfig>;
// Map from backend name to list of setting=value pairs of cmdline
// settings for the backend.
using BackendCmdlineConfig = std::vector<std::pair<std::string, std::string>>;
using BackendCmdlineConfigMap =
std::unordered_map<std::string, BackendCmdlineConfig>;
/// The value for a dimension in a shape that indicates that that
/// dimension can take on any size.
constexpr int WILDCARD_DIM = -1;
constexpr int SCHEDULER_DEFAULT_NICE = 5;
/// Enumeration for the different platform types.
enum Platform {
PLATFORM_UNKNOWN = 0,
PLATFORM_TENSORRT_PLAN = 1,
PLATFORM_TENSORFLOW_GRAPHDEF = 2,
PLATFORM_TENSORFLOW_SAVEDMODEL = 3,
PLATFORM_ENSEMBLE = 4,
PLATFORM_ONNXRUNTIME_ONNX = 5,
PLATFORM_PYTORCH_LIBTORCH = 6
};
/// Get the number of elements in a shape.
/// \param dims The shape.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const DimsList& dims);
/// Get the number of elements in a shape.
/// \param dims The shape.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const std::vector<int64_t>& dims);
/// Get the number of elements in the shape of a model input.
/// \param mio The model input.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const inference::ModelInput& mio);
/// Get the number of elements in the shape of a model output.
/// \param mio The model output.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const inference::ModelOutput& mio);
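/// Example (illustrative): using the std::vector overload above,
///   GetElementCount(std::vector<int64_t>{2, 3, 4}) returns 24, while
///   GetElementCount(std::vector<int64_t>{WILDCARD_DIM, 3}) returns -1
/// because the wildcard dimension makes the element count indeterminate.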
/// Are values of a datatype fixed-size, or variable-sized.
/// \param dtype The data-type.
/// \return True if datatype values are fixed-sized, false if
/// variable-sized.
bool IsFixedSizeDataType(const inference::DataType dtype);
/// Get the size of objects of a given datatype in bytes.
/// \param dtype The data-type.
/// \return The size, in bytes, of objects of the datatype, or 0 if
/// size cannot be determined (for example, values of type TYPE_STRING
/// have variable length and so the size cannot be determined just from the
/// type).
size_t GetDataTypeByteSize(const inference::DataType dtype);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::DataType& dtype, const DimsList& dims);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const inference::DataType& dtype, const std::vector<int64_t>& dims);
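// Illustrative sketch (editorial addition); the shapes are hypothetical.
//
//   std::vector<int64_t> shape{4, 3};
//   GetByteSize(inference::TYPE_FP32, shape);      // 48 (12 elements * 4 bytes)
//
//   std::vector<int64_t> dynamic{WILDCARD_DIM, 3};
//   GetByteSize(inference::TYPE_FP32, dynamic);    // -1, size cannot be determined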
/// Get the size, in bytes, of a tensor based on batch-size, datatype
/// and shape. A tensor that has empty shape [] and non-zero
/// batch-size is sized as a tensor with shape [ batch-size ].
/// \param batch_size The batch-size. May be 0 to indicate no
/// batching.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const int batch_size, const inference::DataType& dtype,
const DimsList& dims);
/// Get the size, in bytes, of a tensor based on batch-size, datatype
/// and shape. A tensor that has empty shape [] and non-zero
/// batch-size is sized as a tensor with shape [ batch-size ].
/// \param batch_size The batch-size. May be 0 to indicate no
/// batching.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const int batch_size, const inference::DataType& dtype,
const std::vector<int64_t>& dims);
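// Illustrative sketch (editorial addition) of the empty-shape case described
// above: with a non-zero batch size, an empty shape [] is treated as
// [ batch-size ].
//
//   std::vector<int64_t> empty_shape{};
//   GetByteSize(8, inference::TYPE_FP32, empty_shape);   // 32 (8 * 4 bytes)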
/// Get the size, in bytes, of a tensor based on ModelInput.
/// \param mio The ModelInput protobuf.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::ModelInput& mio);
/// Get the size, in bytes, of a tensor based on ModelOutput.
/// \param mio The ModelOutput protobuf.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::ModelOutput& mio);
/// Get the CPU thread nice level associated with a model
/// configuration's priority.
/// \param config The model configuration.
/// \return The nice level.
int GetCpuNiceLevel(const inference::ModelConfig& config);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// compared literally so that to be equal the two shapes must both
/// specify WILDCARD_DIM in the same dimensions.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDims(const DimsList& dims0, const DimsList& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// compared literally so that to be equal the two shapes must both
/// specify WILDCARD_DIM in the same dimensions.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDims(
const std::vector<int64_t>& dims0, const std::vector<int64_t>& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// allowed to match with any value. So, a dimension in one shape
/// specified as WILDCARD_DIM will always match the same dimension in
/// the other shape.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDimsWithWildcard(const DimsList& dims0, const DimsList& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// allowed to match with any value. So, a dimension in one shape
/// specified as WILDCARD_DIM will always match the same dimension in
/// the other shape.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDimsWithWildcard(
const DimsList& dims0, const std::vector<int64_t>& dims1);
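// Illustrative sketch (editorial addition) contrasting literal and wildcard
// comparison; the shapes are hypothetical.
//
//   DimsList config_dims;                  // e.g. a [-1, 3] shape from a model config
//   config_dims.Add(WILDCARD_DIM);
//   config_dims.Add(3);
//   std::vector<int64_t> actual{4, 3};
//
//   CompareDimsWithWildcard(config_dims, actual);   // true, WILDCARD_DIM matches 4
//
//   std::vector<int64_t> wildcard_vec{WILDCARD_DIM, 3};
//   CompareDims(wildcard_vec, actual);              // false, WILDCARD_DIM compared literally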
/// Convert a DimsList to string representation.
/// \param dims The DimsList to be converted.
/// \return String representation of the DimsList in pattern
/// "[d0,d1,...,dn]"
std::string DimsListToString(const DimsList& dims);
/// Convert a vector representing a shape to string representation.
/// \param dims The vector of dimensions to be converted.
/// \param start_idx Index of the first dimension to include in the
/// string representation.
/// \return String representation of the vector in pattern
/// "[d0,d1,...,dn]".
std::string DimsListToString(
const std::vector<int64_t>& dims, const int start_idx = 0);
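// Illustrative sketch (editorial addition); the output pattern follows the
// documentation above.
//
//   std::vector<int64_t> dims{4, WILDCARD_DIM, 2};
//   DimsListToString(dims);                // "[4,-1,2]"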
/// Get the server protocol string representation of a datatype.
/// \param dtype The data type.
/// \return The string representation.
const char* DataTypeToProtocolString(const inference::DataType dtype);
/// Get the datatype corresponding to a server protocol string
/// representation of a datatype.
/// \param dtype The string representation of the datatype.
/// \return The data type.
inference::DataType ProtocolStringToDataType(const std::string& dtype);
/// Get the datatype corresponding to a server protocol string
/// representation of a datatype.
/// \param dtype Pointer to string.
/// \param len Length of the string.
/// \return The data type.
inference::DataType ProtocolStringToDataType(const char* dtype, size_t len);
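// Illustrative sketch (editorial addition). Treat the exact protocol string
// ("FP32") as an assumption of this sketch; it is the usual server protocol
// name for TYPE_FP32.
//
//   DataTypeToProtocolString(inference::TYPE_FP32);   // "FP32"
//   ProtocolStringToDataType("FP32");                 // inference::TYPE_FP32
//   ProtocolStringToDataType("FP32", 4);              // same, with explicit length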
}} // namespace triton::common