// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "bls.h"

namespace triton { namespace backend { namespace bls {

BLSExecutor::BLSExecutor(TRITONSERVER_Server* server)
    : server_(server), model_executor_(server)
{
}

TRITONSERVER_Error*
BLSExecutor::PrepareInferenceRequest(
    TRITONBACKEND_Request* bls_request,
    TRITONSERVER_InferenceRequest** irequest, const std::string model_name)
{
  // Get request_id, correlation_id, and flags from the current request
  // for preparing a new inference request that we will send to the
  // 'addsub_python' or 'addsub_tf' model later.
  const char* request_id;
  uint64_t correlation_id;
  uint32_t flags;
  RETURN_IF_ERROR(TRITONBACKEND_RequestId(bls_request, &request_id));
  RETURN_IF_ERROR(
      TRITONBACKEND_RequestCorrelationId(bls_request, &correlation_id));
  RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(bls_request, &flags));

  // Create an inference request object. The inference request object
  // is where we set the name of the model we want to use for
  // inference and the input tensors.
  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestNew(
      irequest, server_, model_name.c_str(), -1 /* model_version */));

  // Set request_id, correlation_id, and flags for the new request.
  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetId(*irequest, request_id));
  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetCorrelationId(
      *irequest, correlation_id));
  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetFlags(*irequest, flags));

  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback(
      *irequest, InferRequestComplete, nullptr /* request_release_userp */));

  return nullptr;  // success
}
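// The release callback registered above is what lets Triton hand ownership
// of the internal request back to this backend. A minimal sketch of what
// such a callback typically looks like is shown below; the actual
// definition is assumed to live elsewhere in this backend's utilities (it
// is only declared in bls.h), and this body is illustrative, not the
// authoritative implementation:
//
//   void
//   InferRequestComplete(
//       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
//       void* userp)
//   {
//     // The internal request is no longer in use by the server, so it is
//     // safe to delete it here.
//     if (request != nullptr) {
//       LOG_IF_ERROR(
//           TRITONSERVER_InferenceRequestDelete(request),
//           "Failed to delete inference request.");
//     }
//   }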
TRITONSERVER_Error*
BLSExecutor::PrepareInferenceInput(
    TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)
{
  // Get the properties of the two inputs from the current request.
  // Then, add the two input tensors and append the input data to the new
  // request.
  uint32_t input_count;
  RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(bls_request, &input_count));

  TRITONBACKEND_Input* input;
  const char* name;
  TRITONSERVER_DataType datatype;
  const int64_t* shape;
  uint32_t dims_count;
  size_t data_byte_size;
  TRITONSERVER_MemoryType data_memory_type;
  int64_t data_memory_id;
  const char* data_buffer;

  for (size_t count = 0; count < input_count; count++) {
    RETURN_IF_ERROR(TRITONBACKEND_RequestInputByIndex(
        bls_request, count /* index */, &input));
    RETURN_IF_ERROR(TRITONBACKEND_InputProperties(
        input, &name, &datatype, &shape, &dims_count, nullptr, nullptr));
    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        input, 0 /* idx */, reinterpret_cast<const void**>(&data_buffer),
        &data_byte_size, &data_memory_type, &data_memory_id));
    RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAddInput(
        irequest, name, datatype, shape, dims_count));
    RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAppendInputData(
        irequest, name, &data_buffer[0], data_byte_size, data_memory_type,
        data_memory_id));
  }

  return nullptr;  // success
}

TRITONSERVER_Error*
BLSExecutor::PrepareInferenceOutput(
    TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)
{
  // Indicate the output tensors to be calculated and returned
  // for the inference request.
  uint32_t output_count;
  RETURN_IF_ERROR(TRITONBACKEND_RequestOutputCount(bls_request, &output_count));
  const char* output_name;
  for (size_t count = 0; count < output_count; count++) {
    RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName(
        bls_request, count /* index */, &output_name));
    RETURN_IF_ERROR(
        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output_name));
  }

  return nullptr;  // success
}
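// Execute() below relies on model_executor_.AsyncExecute() to pair each
// internal request with a std::future that is later fulfilled by the
// response-complete callback. A minimal sketch of that pattern, assuming
// the executor owns a TRITONSERVER_ResponseAllocator* member named
// 'allocator_' (an assumption; the real member name may differ), might
// look like this:
//
//   TRITONSERVER_Error*
//   ModelExecutor::AsyncExecute(
//       TRITONSERVER_InferenceRequest* irequest,
//       std::future<TRITONSERVER_InferenceResponse*>* future)
//   {
//     // The promise is fulfilled by InferResponseComplete once the
//     // internal inference finishes; the caller blocks on the future.
//     auto* promise = new std::promise<TRITONSERVER_InferenceResponse*>();
//     *future = promise->get_future();
//     RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
//         irequest, allocator_, nullptr /* response_allocator_userp */,
//         InferResponseComplete, reinterpret_cast<void*>(promise)));
//     RETURN_IF_ERROR(
//         TRITONSERVER_ServerInferAsync(server_, irequest, nullptr /* trace */));
//     return nullptr;  // success
//   }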
void
BLSExecutor::Execute(
    TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response)
{
  // The names of the models that we will send internal requests to.
  std::vector<std::string> model_names = {"addsub_python", "addsub_tf"};

  // Check that both models are valid before executing the request.
  try {
    for (size_t i = 0; i < 2; i++) {
      // Check if the model is ready.
      bool is_ready = false;
      THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady(
          server_, model_names[i].c_str(), -1 /* model_version */, &is_ready));
      if (!is_ready) {
        throw BLSBackendException(
            (std::string("Failed to execute the inference request. Model '") +
             model_names[i].c_str() + "' is not ready.")
                .c_str());
      }

      // For simplicity, the decoupled API is not supported in this BLS
      // backend. You can implement your own backend that supports decoupled
      // models.
      uint32_t txn_flags;
      THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties(
          server_, model_names[i].c_str(), -1 /* model_version */, &txn_flags,
          nullptr /* voidp */));
      if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) {
        throw BLSBackendException(
            std::string("Model '") + model_names[i].c_str() +
            "' uses the decoupled transaction policy. This BLS backend "
            "doesn't support models using the decoupled transaction policy.");
      }
    }
  }
  catch (const BLSBackendException& bls_exception) {
    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONSERVER_ErrorNew(
                      TRITONSERVER_ERROR_INTERNAL,
                      "Failed to send inference requests"));
    return;
  }

  // Prepare a std::future for each model. Since this BLS backend
  // can handle requests in parallel, we will send all the inference
  // requests first and then retrieve the responses later.
  std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures(2);

  // The inference request object for sending internal requests.
  TRITONSERVER_InferenceRequest* irequest = nullptr;

  // For each inference request, the backend sends two requests on the
  // 'addsub_python' and 'addsub_tf' models.
  try {
    for (size_t icount = 0; icount < 2; icount++) {
      // Initialize the inference request with the required information.
      THROW_IF_TRITON_ERROR(
          PrepareInferenceRequest(bls_request, &irequest, model_names[icount]));
      THROW_IF_TRITON_ERROR(PrepareInferenceInput(bls_request, irequest));
      THROW_IF_TRITON_ERROR(PrepareInferenceOutput(bls_request, irequest));

      // Execute the inference request.
      THROW_IF_TRITON_ERROR(
          model_executor_.AsyncExecute(irequest, &futures[icount]));
    }
  }
  catch (const BLSBackendException& bls_exception) {
    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());

    LOG_IF_ERROR(
        TRITONSERVER_InferenceRequestDelete(irequest),
        "Failed to delete inference request.");

    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONSERVER_ErrorNew(
                      TRITONSERVER_ERROR_INTERNAL,
                      "Failed to send inference requests"));
    return;
  }

  // If both internal requests are sent successfully, retrieve the output from
  // each request and construct the final response.
  ConstructFinalResponse(response, std::move(futures));
}
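// ConstructFinalResponse() below notes that "the InferResponseComplete
// function sets the std::promise". A minimal sketch of that callback,
// matching the promise/future pairing assumed in the AsyncExecute sketch
// above (illustrative only; the real definition lives in this backend's
// utilities):
//
//   void
//   InferResponseComplete(
//       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
//       void* userp)
//   {
//     if (response != nullptr) {
//       // Hand the completed response to the waiting future, then free
//       // the heap-allocated promise created in AsyncExecute.
//       auto* promise = reinterpret_cast<
//           std::promise<TRITONSERVER_InferenceResponse*>*>(userp);
//       promise->set_value(response);
//       delete promise;
//     }
//   }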
void
BLSExecutor::ConstructFinalResponse(
    TRITONBACKEND_Response** response,
    std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures)
{
  // Prepare two TRITONSERVER_InferenceResponse* objects for 'addsub_python'
  // and 'addsub_tf' respectively.
  std::vector<TRITONSERVER_InferenceResponse*> completed_responses = {
      nullptr, nullptr};

  const char* output_name;
  TRITONSERVER_DataType output_datatype;
  const int64_t* output_shape;
  uint64_t dims_count;
  size_t output_byte_size;
  TRITONSERVER_MemoryType output_memory_type;
  int64_t output_memory_id;
  const void* output_base;
  void* userp;

  for (size_t icount = 0; icount < 2; icount++) {
    // Retrieve the corresponding TRITONSERVER_InferenceResponse object from
    // 'futures'. The InferResponseComplete function sets the std::promise
    // so that this thread will block until the response is returned.
    completed_responses[icount] = futures[icount].get();
    try {
      THROW_IF_TRITON_ERROR(
          TRITONSERVER_InferenceResponseError(completed_responses[icount]));
    }
    catch (const BLSBackendException& bls_exception) {
      LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());

      if (completed_responses[icount] != nullptr) {
        LOG_IF_ERROR(
            TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),
            "Failed to delete inference response.");
      }
      return;
    }

    // Retrieve outputs from 'completed_responses'.
    // Extract OUTPUT0 from the 'addsub_python' model and OUTPUT1 from the
    // 'addsub_tf' model to form the final inference response object.

    // Get all the information about the output tensor.
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONSERVER_InferenceResponseOutput(
                      completed_responses[icount], icount, &output_name,
                      &output_datatype, &output_shape, &dims_count,
                      &output_base, &output_byte_size, &output_memory_type,
                      &output_memory_id, &userp));

    // Create an output tensor in the final response with
    // the information retrieved above.
    TRITONBACKEND_Output* output;
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONBACKEND_ResponseOutput(
                      *response, &output, output_name, output_datatype,
                      output_shape, dims_count));

    // Get a buffer that holds the tensor data for the output.
    // We request a buffer in CPU memory but we have to handle any returned
    // type. If we get back a buffer in GPU memory we just fail the request.
    void* output_buffer;
    output_memory_type = TRITONSERVER_MEMORY_CPU;
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONBACKEND_OutputBuffer(
                      output, &output_buffer, output_byte_size,
                      &output_memory_type, &output_memory_id));
    if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
      RESPOND_AND_SET_NULL_IF_ERROR(
          response, TRITONSERVER_ErrorNew(
                        TRITONSERVER_ERROR_INTERNAL,
                        "failed to create output buffer in CPU memory"));
      // Fail the request as promised above: clean up and return instead of
      // copying into a buffer we cannot write to from the CPU.
      LOG_IF_ERROR(
          TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),
          "Failed to delete inference response.");
      return;
    }

    // Fill the BLS output buffer with the output data returned by the
    // internal requests.
    memcpy(output_buffer, output_base, output_byte_size);

    LOG_IF_ERROR(
        TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),
        "Failed to delete inference response.");
  }
}

}}}  // namespace triton::backend::bls
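// Usage sketch: how a backend entry point might drive BLSExecutor. The
// names 'requests', 'request_count', and 'server' mirror what a
// TRITONBACKEND_ModelInstanceExecute implementation would have in scope
// (the TRITONSERVER_Server* would come from TRITONBACKEND_ModelServer);
// this is a hypothetical caller, not part of this file:
//
//   for (uint32_t r = 0; r < request_count; ++r) {
//     TRITONBACKEND_Request* request = requests[r];
//     TRITONBACKEND_Response* response;
//     LOG_IF_ERROR(
//         TRITONBACKEND_ResponseNew(&response, request),
//         "Failed to create response");
//
//     // Execute() fills 'response' (or sets it to nullptr on error).
//     triton::backend::bls::BLSExecutor bls_executor(server);
//     bls_executor.Execute(request, &response);
//
//     if (response != nullptr) {
//       LOG_IF_ERROR(
//           TRITONBACKEND_ResponseSend(
//               response, TRITONSERVER_RESPONSE_COMPLETE_FINAL,
//               nullptr /* success */),
//           "failed to send response");
//     }
//     LOG_IF_ERROR(
//         TRITONBACKEND_RequestRelease(
//             request, TRITONSERVER_REQUEST_RELEASE_ALL),
//         "failed to release request");
//   }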