// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <deque>
#include <functional>
#include <memory>
#include <string>

#include "buffer_attributes.h"
#include "constants.h"
#include "infer_parameter.h"
#include "infer_trace.h"
#include "response_allocator.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"

namespace triton { namespace core {

class Model;
class InferenceResponse;

//
// An inference response factory.
//
class InferenceResponseFactory {
 public:
  InferenceResponseFactory() = default;

  InferenceResponseFactory(
      const std::shared_ptr<Model>& model, const std::string& id,
      const ResponseAllocator* allocator, void* alloc_userp,
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp,
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
      : model_(model), id_(id), allocator_(allocator),
        alloc_userp_(alloc_userp), response_fn_(response_fn),
        response_userp_(response_userp), response_delegator_(delegator)
  {
  }

  const ResponseAllocator* Allocator() { return allocator_; }
  void* AllocatorUserp() { return alloc_userp_; }

  Status SetResponseDelegator(
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
  {
    response_delegator_ = delegator;
    return Status::Success;
  }

  // Create a new response.
  Status CreateResponse(std::unique_ptr<InferenceResponse>* response) const;

  // Send a "null" response with 'flags'.
  Status SendFlags(const uint32_t flags) const;

#ifdef TRITON_ENABLE_TRACING
  const std::shared_ptr<InferenceTraceProxy>& Trace() const { return trace_; }
  void SetTrace(const std::shared_ptr<InferenceTraceProxy>& trace)
  {
    trace_ = trace;
  }
  void ReleaseTrace() { trace_ = nullptr; }
#endif  // TRITON_ENABLE_TRACING

 private:
  // The model associated with this factory. For normal
  // requests/responses this will always be defined and acts to keep
  // the model loaded as long as this factory is live. It may be
  // nullptr for cases where the model itself created the request
  // (like running requests for warmup) and so must protect any uses
  // to handle the nullptr case.
  std::shared_ptr<Model> model_;

  // The ID of the corresponding request that should be included in every
  // response. This is a property that can be optionally provided by the user.
  std::string id_;

  // The response allocator and user pointer. The 'allocator_' is a
  // raw pointer because it is owned by the client, and the client is
  // responsible for ensuring that the lifetime of the allocator
  // extends longer than any request or response that depends on the
  // allocator.
  const ResponseAllocator* allocator_;
  void* alloc_userp_;

  // The response callback function and user pointer.
  TRITONSERVER_InferenceResponseCompleteFn_t response_fn_;
  void* response_userp_;

  // Delegator to be invoked on sending responses.
  std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
      response_delegator_;

#ifdef TRITON_ENABLE_TRACING
  // Inference trace associated with this response.
  std::shared_ptr<InferenceTraceProxy> trace_;
#endif  // TRITON_ENABLE_TRACING
};
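// A minimal usage sketch of the factory (comment-only, illustrative; not part
// of this header's API). It assumes a loaded Model 'model', a client-provided
// ResponseAllocator 'allocator', and a completion callback
// 'MyResponseComplete' matching TRITONSERVER_InferenceResponseCompleteFn_t;
// those names are placeholders, not symbols defined by this file, and error
// handling is abbreviated.
//
//   InferenceResponseFactory factory(
//       model, "request-0" /* request id */, allocator,
//       nullptr /* alloc_userp */, MyResponseComplete,
//       nullptr /* response_userp */, nullptr /* no delegator */);
//
//   // Every response for the request is created from the same factory so it
//   // carries the same allocator, callback, and request id.
//   std::unique_ptr<InferenceResponse> response;
//   Status status = factory.CreateResponse(&response);
//
//   // When no more responses will be produced, a "null" response carrying
//   // only the final flag can be sent.
//   if (status.IsOk()) {
//     status = factory.SendFlags(TRITONSERVER_RESPONSE_COMPLETE_FINAL);
//   }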
//
// An inference response.
//
class InferenceResponse {
 public:
  // Output tensor
  class Output {
   public:
    Output(
        const std::string& name, const inference::DataType datatype,
        const std::vector<int64_t>& shape, const ResponseAllocator* allocator,
        void* alloc_userp)
        : name_(name), datatype_(datatype), shape_(shape),
          allocator_(allocator), alloc_userp_(alloc_userp),
          allocated_buffer_(nullptr)
    {
    }
    Output(
        const std::string& name, const inference::DataType datatype,
        std::vector<int64_t>&& shape, const ResponseAllocator* allocator,
        void* alloc_userp)
        : name_(name), datatype_(datatype), shape_(std::move(shape)),
          allocator_(allocator), alloc_userp_(alloc_userp),
          allocated_buffer_(nullptr)
    {
    }

    ~Output();

    // The name of the output tensor.
    const std::string& Name() const { return name_; }

    // Data type of the output tensor.
    inference::DataType DType() const { return datatype_; }

    // The shape of the output tensor.
    const std::vector<int64_t>& Shape() const { return shape_; }

    BufferAttributes* GetBufferAttributes() { return &buffer_attributes_; }

    // Reshape the output tensor. This function must only be called
    // for outputs that have reshape specified in the model
    // configuration.
    void Reshape(
        const bool has_batch_dim, const inference::ModelOutput* output_config);

    // Get information about the buffer allocated for this output
    // tensor's data. If no buffer is allocated 'buffer' will return
    // nullptr and the other returned values will be undefined.
    Status DataBuffer(
        const void** buffer, size_t* buffer_byte_size,
        TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
        void** userp) const;

    // Allocate the buffer that should be used for this output
    // tensor's data. 'buffer' must return a buffer of size
    // 'buffer_byte_size'. 'memory_type' acts as both input and
    // output. On input gives the buffer memory type preferred by the
    // caller and on return holds the actual memory type of
    // 'buffer'. 'memory_type_id' acts as both input and output. On
    // input gives the buffer memory type id preferred by the caller
    // and returns the actual memory type id of 'buffer'. Only a
    // single buffer may be allocated for the output at any time, so
    // multiple calls to AllocateDataBuffer without an intervening
    // ReleaseDataBuffer call will result in an error.
    Status AllocateDataBuffer(
        void** buffer, const size_t buffer_byte_size,
        TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

    // Release the buffer that was previously allocated by
    // AllocateDataBuffer(). Do nothing if AllocateDataBuffer() has
    // not been called.
    Status ReleaseDataBuffer();

   private:
    DISALLOW_COPY_AND_ASSIGN(Output);
    friend std::ostream& operator<<(
        std::ostream& out, const InferenceResponse::Output& output);

    std::string name_;
    inference::DataType datatype_;
    std::vector<int64_t> shape_;

    // The response allocator and user pointer.
    const ResponseAllocator* allocator_;
    void* alloc_userp_;

    // Information about the buffer allocated by
    // AllocateDataBuffer(). This information is needed by
    // DataBuffer() and ReleaseDataBuffer().
    void* allocated_buffer_;
    BufferAttributes buffer_attributes_;
    void* allocated_userp_;
  };
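  // Illustrative sketch of the Output buffer lifecycle described above
  // (comment-only; not part of the class). 'out' stands for an Output*
  // obtained from AddOutput() below, and the 4-float payload is a stand-in.
  //
  //   void* buffer = nullptr;
  //   // Request CPU memory; on return these report what was actually given.
  //   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
  //   int64_t memory_type_id = 0;
  //   Status status = out->AllocateDataBuffer(
  //       &buffer, 4 * sizeof(float), &memory_type, &memory_type_id);
  //
  //   // Write the tensor data into 'buffer' (only if CPU-addressable memory
  //   // was actually returned; GPU memory would need a device copy instead).
  //
  //   // A second AllocateDataBuffer() without ReleaseDataBuffer() in between
  //   // returns an error, since an output owns at most one buffer at a time.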
  // InferenceResponse
  InferenceResponse(
      const std::shared_ptr<Model>& model, const std::string& id,
      const ResponseAllocator* allocator, void* alloc_userp,
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp,
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator);

  // "null" InferenceResponse is a special instance of InferenceResponse which
  // contains minimal information for calling InferenceResponse::Send,
  // InferenceResponse::NullResponse. nullptr will be passed as response in
  // 'response_fn'.
  InferenceResponse(
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp);

  const std::string& Id() const { return id_; }
  const std::string& ModelName() const;
  int64_t ActualModelVersion() const;
  const Status& ResponseStatus() const { return status_; }

  // The response parameters.
  const std::deque<InferenceParameter>& Parameters() const
  {
    return parameters_;
  }

  // Add a parameter to the response.
  Status AddParameter(const char* name, const char* value);
  Status AddParameter(const char* name, const int64_t value);
  Status AddParameter(const char* name, const bool value);

  // The response outputs.
  const std::deque<Output>& Outputs() const { return outputs_; }

  // Add an output to the response. If 'output' is non-null
  // return a pointer to the newly added output.
  Status AddOutput(
      const std::string& name, const inference::DataType datatype,
      const std::vector<int64_t>& shape, Output** output = nullptr);
  Status AddOutput(
      const std::string& name, const inference::DataType datatype,
      std::vector<int64_t>&& shape, Output** output = nullptr);

  // Get the classification label associated with an output. Return
  // 'label' == nullptr if no label.
  Status ClassificationLabel(
      const Output& output, const uint32_t class_index,
      const char** label) const;

  // Send the response with success status. Calling this function
  // releases ownership of the response object and gives it to the
  // callback function.
  static Status Send(
      std::unique_ptr<InferenceResponse>&& response, const uint32_t flags);

  // Send the response with explicit status. Calling this function
  // releases ownership of the response object and gives it to the
  // callback function.
  static Status SendWithStatus(
      std::unique_ptr<InferenceResponse>&& response, const uint32_t flags,
      const Status& status);

#ifdef TRITON_ENABLE_TRACING
  const std::shared_ptr<InferenceTraceProxy>& Trace() const { return trace_; }
  void SetTrace(const std::shared_ptr<InferenceTraceProxy>& trace)
  {
    trace_ = trace;
  }
  void ReleaseTrace() { trace_ = nullptr; }
#endif  // TRITON_ENABLE_TRACING

 private:
  DISALLOW_COPY_AND_ASSIGN(InferenceResponse);
  friend std::ostream& operator<<(
      std::ostream& out, const InferenceResponse& response);

#ifdef TRITON_ENABLE_TRACING
  Status TraceOutputTensors(
      TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
#endif  // TRITON_ENABLE_TRACING

  // The model associated with this response. For normal
  // requests/responses this will always be defined and acts to keep
  // the model loaded as long as this response is live. It may be
  // nullptr for cases where the model itself created the request
  // (like running requests for warmup) and so must protect any uses
  // to handle the nullptr case.
  std::shared_ptr<Model> model_;

  // The ID of the corresponding request that should be included in
  // every response.
  std::string id_;

  // Error status for the response.
  Status status_;

  // The parameters of the response. Use a deque so that there is no
  // reallocation.
  std::deque<InferenceParameter> parameters_;

  // The result tensors. Use a deque so that there is no reallocation.
  std::deque<Output> outputs_;

  // The response allocator and user pointer.
  const ResponseAllocator* allocator_;
  void* alloc_userp_;

  // The response callback function and user pointer.
  TRITONSERVER_InferenceResponseCompleteFn_t response_fn_;
  void* response_userp_;

  // Delegator to be invoked on sending responses.
  std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
      response_delegator_;

  bool null_response_;

#ifdef TRITON_ENABLE_TRACING
  // Inference trace associated with this response.
  std::shared_ptr<InferenceTraceProxy> trace_;
#endif  // TRITON_ENABLE_TRACING
};
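// Illustrative sketch of populating and sending a response (comment-only; not
// part of the header). 'response' is a std::unique_ptr<InferenceResponse>
// obtained from InferenceResponseFactory::CreateResponse(); the tensor name,
// datatype, and shape are placeholders.
//
//   InferenceResponse::Output* output = nullptr;
//   Status status = response->AddOutput(
//       "output0", inference::DataType::TYPE_FP32,
//       std::vector<int64_t>{1, 4}, &output);
//
//   // ... allocate and fill the output buffer as sketched in Output above ...
//
//   // Send() takes ownership of the response; 'response' must not be used
//   // afterwards. On failure, SendWithStatus() reports the error status to
//   // the same completion callback.
//   Status send_status;
//   if (status.IsOk()) {
//     send_status = InferenceResponse::Send(
//         std::move(response), 0 /* flags */);
//   } else {
//     send_status = InferenceResponse::SendWithStatus(
//         std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL, status);
//   }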
std::ostream& operator<<(std::ostream& out, const InferenceResponse& response);
std::ostream& operator<<(
    std::ostream& out, const InferenceResponse::Output& output);

}}  // namespace triton::core