// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
cb::Error
InferDataManager::Init()
{
RETURN_IF_ERROR(CreateAndPopulateInputs());
return cb::Error::Success;
}
cb::Error
InferDataManager::CreateAndPopulateInputs()
{
// All combinations of thread + input + stream + step
//
for (size_t thread_id = 0; thread_id < max_threads_; thread_id++) {
for (const auto& input : *(parser_->Inputs())) {
const std::string& name = input.first;
const ModelTensor& tensor = input.second;
for (int stream_id = 0;
stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) {
for (int step_id = 0;
step_id < (int)data_loader_->GetTotalSteps(stream_id);
step_id += 1) {
RETURN_IF_ERROR(CreateAndPopulateInput(
thread_id, name, tensor, stream_id, step_id));
}
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManager::CreateAndPopulateInput(
const size_t thread_id, const std::string& name, const ModelTensor& tensor,
int stream_id, int step_id)
{
std::vector<TensorData> input_datas;
RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas));
if (tensor.is_shape_tensor_) {
RETURN_IF_ERROR(
ValidateShapeTensor(tensor, stream_id, step_id, input_datas));
}
std::vector<int64_t> shape;
RETURN_IF_ERROR(
data_loader_->GetInputShape(tensor, stream_id, step_id, &shape));
if (!shape.empty()) {
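// When the model supports batching, prepend the batch dimension; shape
// tensors never get a batch dimension.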
if ((parser_->MaxBatchSize() != 0) && (!tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
}
cb::InferInput* input;
RETURN_IF_ERROR(
CreateInferInput(&input, backend_kind_, name, shape, tensor.datatype_));
// Number of missing pieces of data for optional inputs
size_t missing_data_cnt = 0;
const size_t total_cnt = input_datas.size();
for (size_t i = 0; i < total_cnt; i++) {
if (!input_datas[i].is_valid) {
missing_data_cnt++;
} else {
RETURN_IF_ERROR(input->AppendRaw(
input_datas[i].data_ptr, input_datas[i].batch1_size));
}
}
// If every batch element provided data for this optional input, the input is
// valid. If some batch elements provided data and others did not, that is an
// invalid case and an error is returned.
if (missing_data_cnt == 0) {
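// Store the fully populated input keyed by (thread_id, name, stream_id,
// step_id) so that GetInput() can retrieve it when UpdateInputs() runs.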
inputs_.insert({{thread_id, name, stream_id, step_id}, input});
} else if (missing_data_cnt > 0 && missing_data_cnt < total_cnt) {
return cb::Error(
"For batch sizes larger than 1, the same set of inputs must be "
"specified for each batch. You cannot use a different set of "
"optional inputs for each individual batch.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::InferInput*
InferDataManager::GetInput(
const size_t thread_id, const std::string& name, int stream_id, int step_id)
{
auto input = inputs_.find({thread_id, name, stream_id, step_id});
if (input == inputs_.end()) {
return nullptr;
} else {
return input->second;
}
}
cb::Error
InferDataManager::InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data)
{
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape));
if (shape.empty() && (backend_kind_ == cb::BackendKind::TRITON)) {
return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR);
}
if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
cb::InferInput* infer_input;
RETURN_IF_ERROR(CreateInferInput(
&infer_input, backend_kind_, name, shape, model_tensor.datatype_));
infer_data.inputs_.push_back(infer_input);
TensorData input_data;
RETURN_IF_ERROR(data_loader_->GetInputData(model_tensor, 0, 0, input_data));
// Add optional input to request if data was found
if (input_data.is_valid) {
infer_data.valid_inputs_.push_back(infer_input);
}
if (!shape.empty()) {
size_t max_count = (parser_->MaxBatchSize() == 0) ? 1 : batch_size_;
for (size_t i = 0; i < max_count; ++i) {
RETURN_IF_ERROR(
infer_input->AppendRaw(input_data.data_ptr, input_data.batch1_size));
}
}
return cb::Error::Success;
}
cb::Error
InferDataManager::InitInferDataOutput(
const std::string& name, InferData& infer_data)
{
cb::InferRequestedOutput* requested_output;
RETURN_IF_ERROR(
cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
infer_data.outputs_.push_back(requested_output);
return cb::Error::Success;
}
cb::Error
InferDataManager::UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data)
{
// Reset inputs for this inference request
infer_data.valid_inputs_.clear();
for (const auto& input : infer_data.inputs_) {
const auto& name = input->Name();
cb::InferInput* tmp_input =
GetInput(thread_id, name, stream_index, step_index);
if (tmp_input != nullptr) {
infer_data.valid_inputs_.push_back(tmp_input);
}
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
/// Manages the infer data needed to prepare an inference request and to hold
/// the resulting inference output from the Triton server
class InferDataManager : public InferDataManagerBase {
public:
InferDataManager(
const size_t max_threads, const int32_t batch_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: InferDataManagerBase(batch_size, parser, factory, data_loader),
max_threads_(max_threads)
{
}
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
cb::Error Init() override;
protected:
const size_t max_threads_{1};
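// Fully populated inputs, keyed by (thread_id, input name, stream_id,
// step_id). Entries are created by CreateAndPopulateInput() and looked up by
// GetInput() when UpdateInputs() assembles a request.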
std::map<std::tuple<size_t, std::string, int, int>, cb::InferInput*> inputs_;
cb::Error CreateAndPopulateInputs();
cb::Error CreateAndPopulateInput(
const size_t thread_id, const std::string& name,
const ModelTensor& model_tensor, int stream_id, int step_id);
cb::InferInput* GetInput(
const size_t thread_id, const std::string& name, int stream_id,
int step_id);
cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) override;
cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) override;
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data);
#ifndef DOCTEST_CONFIG_DISABLE
public:
InferDataManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_base.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
cb::Error
InferDataManagerBase::GetInputData(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id, std::vector<TensorData>& input_datas)
{
size_t max_count = tensor.is_shape_tensor_ ? 1 : batch_size_;
std::vector<int64_t> shape;
std::vector<int64_t> prev_shape;
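// Gather one batch's worth of data (a single element for shape tensors).
// local_step_id wraps around the end of the data stream so that a batch
// starting near the last step reuses data from the beginning of the stream.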
for (size_t count = 0; count < max_count; count++) {
int local_step_id =
(step_id + count) % data_loader_->GetTotalSteps(stream_id);
TensorData input_data;
RETURN_IF_ERROR(
data_loader_->GetInputShape(tensor, stream_id, local_step_id, &shape));
if (!shape.empty()) {
if (count == 0) {
prev_shape = shape;
} else {
if (!std::equal(shape.begin(), shape.end(), prev_shape.begin())) {
return cb::Error(
"cannot batch tensors with different shapes together "
"(input '" +
name + "' expected shape " + ShapeVecToString(prev_shape) +
" and received " + ShapeVecToString(shape) + ")",
pa::GENERIC_ERROR);
}
}
}
RETURN_IF_ERROR(data_loader_->GetInputData(
tensor, stream_id, local_step_id, input_data));
input_datas.push_back(input_data);
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::ValidateShapeTensor(
const ModelTensor& tensor, int stream_id, int step_id,
const std::vector<TensorData>& input_datas)
{
// Validate that steps 1 through N are exactly the same as step 0, since step
// 0 is the only one we send for shape tensors
for (size_t count = 1; count < batch_size_; count++) {
int local_step_id =
(step_id + count) % data_loader_->GetTotalSteps(stream_id);
TensorData input_data;
RETURN_IF_ERROR(data_loader_->GetInputData(
tensor, stream_id, local_step_id, input_data));
if (input_data.batch1_size != input_datas.back().batch1_size) {
return cb::Error(
"The shape tensors should be identical in a batch (mismatch "
"in size)",
pa::GENERIC_ERROR);
}
for (size_t data_idx = 0; data_idx < input_data.batch1_size; data_idx++) {
if (*(input_data.data_ptr + data_idx) !=
*(input_datas.back().data_ptr + data_idx)) {
return cb::Error(
"The shape tensors should be identical in a batch "
"(mismatch in content)",
pa::GENERIC_ERROR);
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::InitInferData(InferData& infer_data)
{
// Initialize inputs
for (const auto& input : *(parser_->Inputs())) {
RETURN_IF_ERROR(InitInferDataInput(input.first, input.second, infer_data));
}
for (const auto& output : *(parser_->Outputs())) {
RETURN_IF_ERROR(InitInferDataOutput(output.first, infer_data));
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::UpdateInferData(
size_t thread_id, int stream_index, int step_index, InferData& infer_data)
{
RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index));
RETURN_IF_ERROR(
UpdateInputs(thread_id, stream_index, step_index, infer_data));
RETURN_IF_ERROR(
UpdateValidationOutputs(stream_index, step_index, infer_data));
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::UpdateValidationOutputs(
int stream_index, int step_index, InferData& infer_data)
{
RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index));
infer_data.expected_outputs_.clear();
for (const auto& output : infer_data.outputs_) {
const auto& model_output = (*(parser_->Outputs()))[output->Name()];
TensorData output_data;
const int* set_shape_values = nullptr;
int set_shape_value_cnt = 0;
std::vector<TensorData> outputs;
for (size_t i = 0; i < batch_size_; ++i) {
RETURN_IF_ERROR(data_loader_->GetOutputData(
output->Name(), stream_index,
(step_index + i) % data_loader_->GetTotalSteps(0), output_data));
if (!output_data.is_valid) {
break;
}
outputs.emplace_back(output_data);
// Shape tensors only need the first batch element
if (model_output.is_shape_tensor_) {
break;
}
}
if (!outputs.empty()) {
infer_data.expected_outputs_.emplace_back(std::move(outputs));
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::CreateInferInput(
cb::InferInput** infer_input, const cb::BackendKind kind,
const std::string& name, const std::vector<int64_t>& dims,
const std::string& datatype)
{
return cb::InferInput::Create(infer_input, kind, name, dims, datatype);
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data.h"
#include "model_parser.h"
#include "perf_utils.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
/// Base class for Infer Data managers
///
class InferDataManagerBase : public IInferDataManager {
public:
InferDataManagerBase(
const int32_t batch_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: batch_size_(batch_size), parser_(parser), factory_(factory),
data_loader_(data_loader), backend_kind_(factory->Kind())
{
}
/// Populate the target InferData object with input and output objects
/// according to the model's shape
/// \param infer_data The target InferData object.
/// \return cb::Error object indicating success or failure.
cb::Error InitInferData(InferData& infer_data) override;
/// Updates the input data to use for the inference request
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateInferData(
size_t thread_id, int stream_index, int step_index,
InferData& infer_data) override;
protected:
size_t batch_size_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
std::shared_ptr<DataLoader> data_loader_;
std::unique_ptr<cb::ClientBackend> backend_;
cb::BackendKind backend_kind_;
/// Gets the input data for the specified input for the specified batch size
///
/// \param name The name of the input to get data for
/// \param tensor The ModelTensor of the input to get data for
/// \param stream_id The ID of the stream to get data for
/// \param step_id The ID of the step within the stream
/// \param input_datas The returned vector of TensorDatas
/// \return cb::Error object indicating success or failure.
cb::Error GetInputData(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id, std::vector<TensorData>& input_datas);
/// For an input with is_shape_tensor_ set, validate that it follows all
/// shape-tensor rules and return an error if it does not
/// \param tensor The ModelTensor of the input to validate
/// \param stream_id The ID of the stream to validate
/// \param step_id The ID of the step within the stream
/// \param input_datas vector of TensorDatas to validate
/// \return cb::Error object indicating success or failure.
cb::Error ValidateShapeTensor(
const ModelTensor& tensor, int stream_id, int step_id,
const std::vector<TensorData>& input_datas);
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data) = 0;
/// Updates the expected output data to use for the inference request. An
/// empty vector will be returned if there is no expected output associated
/// with the step.
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateValidationOutputs(
int stream_index, int step_index, InferData& infer_data);
/// Creates inference input object
/// \param infer_input Output parameter storing newly created inference input
/// \param kind Backend kind
/// \param name Name of inference input
/// \param dims Shape of inference input
/// \param datatype Data type of inference input
/// \return cb::Error object indicating success or failure.
virtual cb::Error CreateInferInput(
cb::InferInput** infer_input, const cb::BackendKind kind,
const std::string& name, const std::vector<int64_t>& dims,
const std::string& datatype);
virtual cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) = 0;
virtual cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) = 0;
#ifndef DOCTEST_CONFIG_DISABLE
public:
InferDataManagerBase() = default;
#endif
};
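// Illustrative sketch (not part of the original sources): the minimum a new
// manager variant must supply on top of InferDataManagerBase is the three
// pure-virtual hooks below, plus Init() from IInferDataManager. The class
// name and the trivial bodies are hypothetical placeholders, and this assumes
// IInferDataManager declares no further pure-virtual methods.
class ExampleInferDataManager : public InferDataManagerBase {
 public:
  using InferDataManagerBase::InferDataManagerBase;
  // Must be called before any other functions, per the interface contract.
  cb::Error Init() override { return cb::Error::Success; }

 protected:
  cb::Error InitInferDataInput(
      const std::string& name, const ModelTensor& model_tensor,
      InferData& infer_data) override
  {
    return cb::Error::Success;
  }
  cb::Error InitInferDataOutput(
      const std::string& name, InferData& infer_data) override
  {
    return cb::Error::Success;
  }
  cb::Error UpdateInputs(
      const size_t thread_id, const int stream_index, const int step_index,
      InferData& infer_data) override
  {
    return cb::Error::Success;
  }
};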
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data_manager.h"
#include "infer_data_manager_shm.h"
#include "model_parser.h"
namespace triton { namespace perfanalyzer {
class InferDataManagerFactory {
public:
static std::shared_ptr<IInferDataManager> CreateInferDataManager(
const size_t max_threads, const int32_t batch_size,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
if (shared_memory_type == SharedMemoryType::NO_SHARED_MEMORY) {
return CreateInferDataManagerNoShm(
max_threads, batch_size, parser, factory, data_loader);
} else {
return CreateInferDataManagerShm(
batch_size, shared_memory_type, output_shm_size, parser, factory,
data_loader);
}
}
private:
static std::shared_ptr<IInferDataManager> CreateInferDataManagerNoShm(
const size_t max_threads, const int32_t batch_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
return std::make_shared<InferDataManager>(
max_threads, batch_size, parser, factory, data_loader);
}
static std::shared_ptr<IInferDataManager> CreateInferDataManagerShm(
const int32_t batch_size, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
return std::make_shared<InferDataManagerShm>(
batch_size, shared_memory_type, output_shm_size, parser, factory,
data_loader);
}
};
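// Illustrative usage sketch (not part of the original sources): shows how a
// caller would pick a manager variant through the factory and initialize it
// before use. The wrapper name and the literal thread/batch values are
// hypothetical; the parser, client backend factory, and data loader are
// assumed to have been constructed elsewhere.
inline std::shared_ptr<IInferDataManager>
ExampleCreateInferDataManager(
    const std::shared_ptr<ModelParser>& parser,
    const std::shared_ptr<cb::ClientBackendFactory>& factory,
    const std::shared_ptr<DataLoader>& data_loader)
{
  // NO_SHARED_MEMORY selects InferDataManager; any other SharedMemoryType
  // selects InferDataManagerShm.
  auto manager = InferDataManagerFactory::CreateInferDataManager(
      /*max_threads=*/4, /*batch_size=*/1, SharedMemoryType::NO_SHARED_MEMORY,
      /*output_shm_size=*/0, parser, factory, data_loader);
  // Init() must succeed before InitInferData()/UpdateInferData() are called.
  if (!manager->Init().IsOk()) {
    return nullptr;
  }
  return manager;
}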
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_shm.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
InferDataManagerShm::~InferDataManagerShm()
{
cb::Error err;
if (backend_.get() != nullptr) {
err = backend_->UnregisterAllSharedMemory();
if (!err.IsOk()) {
std::cerr << "Unable to unregister all shared memory regions"
<< std::endl;
}
if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
for (auto& region : shared_memory_regions_) {
if (factory_->Kind() !=
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
err = backend_->UnmapSharedMemory(
shared_memory_regions_[region.first].data_.get(),
shared_memory_regions_[region.first].byte_size_);
if (!err.IsOk()) {
std::cerr << "Unable to unmap shared memory with key ("
<< region.first << "): Starting: "
<< static_cast<void*>(
shared_memory_regions_[region.first].data_.get())
<< ", size: "
<< shared_memory_regions_[region.first].byte_size_
<< std::endl;
}
err = backend_->UnlinkSharedMemoryRegion(region.first);
if (!err.IsOk()) {
std::cerr << "Unable to unlink shared memory with key: "
<< region.first << std::endl;
}
}
}
}
}
}
cb::Error
InferDataManagerShm::Init()
{
// TMA-1062 remove the factory from this class and use only the backend
RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_));
// Unregister everything up front so that we start from a clean state
backend_->UnregisterAllSharedMemory();
RETURN_IF_ERROR(CreateOutputMemoryRegions());
RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegions());
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateOutputMemoryRegions()
{
// Allocate the shared memory for outputs
for (const auto& output : *(parser_->Outputs())) {
const std::string& name = output.first;
const ModelTensor& tensor = output.second;
int64_t batch1_bytesize = ByteSize(tensor.shape_, tensor.datatype_);
if (batch1_bytesize < 0) {
batch1_bytesize = output_shm_size_;
}
uint8_t* output_shm_ptr;
size_t alloc_size = batch1_bytesize * batch_size_;
std::string region_name(TensorToRegionName(name));
RETURN_IF_ERROR(CreateMemoryRegion(
region_name, shared_memory_type_, alloc_size,
reinterpret_cast<void**>(&output_shm_ptr)));
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateAndPopulateInputMemoryRegions()
{
// All combinations of input + stream + step
//
for (const auto& input : *(parser_->Inputs())) {
const std::string& name = input.first;
const ModelTensor& tensor = input.second;
for (int stream_id = 0;
stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) {
for (int step_id = 0;
step_id < (int)data_loader_->GetTotalSteps(stream_id);
step_id += 1) {
RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegion(
name, tensor, stream_id, step_id));
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateAndPopulateInputMemoryRegion(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id)
{
std::vector<TensorData> input_datas;
RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas));
if (tensor.is_shape_tensor_) {
RETURN_IF_ERROR(
ValidateShapeTensor(tensor, stream_id, step_id, input_datas));
}
size_t alloc_size = 0;
for (size_t i = 0; i < input_datas.size(); i++) {
if (!input_datas[i].is_valid) {
return cb::Error(
"Shared memory mode in Perf Analyzer does not support optional inputs "
"at this time",
pa::GENERIC_ERROR);
}
alloc_size += input_datas[i].batch1_size;
}
// Generate the shared memory region name
std::string region_name(
TensorToRegionName(name) + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
uint8_t* input_shm_ptr;
RETURN_IF_ERROR(CreateMemoryRegion(
region_name, shared_memory_type_, alloc_size,
reinterpret_cast<void**>(&input_shm_ptr)));
RETURN_IF_ERROR(CopySharedMemory(
input_shm_ptr, input_datas, tensor.is_shape_tensor_, region_name));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateMemoryRegion(
const std::string& shm_region_name, const SharedMemoryType& memory_type,
const size_t byte_size, void** ptr)
{
if (memory_type == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
if (factory_->Kind() ==
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
*ptr = new uint8_t[byte_size];
RETURN_IF_ERROR(
backend_->RegisterSystemMemory(shm_region_name, *ptr, byte_size));
// Use delete[] as the deleter to match the new[] allocation above.
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr),
[](uint8_t* memory) { delete[] memory; }))));
} else {
std::string shm_key("/" + shm_region_name);
int shm_fd_op;
RETURN_IF_ERROR(
backend_->CreateSharedMemoryRegion(shm_key, byte_size, &shm_fd_op));
RETURN_IF_ERROR(backend_->MapSharedMemory(shm_fd_op, 0, byte_size, ptr));
RETURN_IF_ERROR(backend_->RegisterSystemSharedMemory(
shm_region_name, shm_key, byte_size));
// No-op destruction
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr), [](uint8_t* memory) {}))));
}
} else if (memory_type == SharedMemoryType::CUDA_SHARED_MEMORY) {
#ifdef TRITON_ENABLE_GPU
cudaError_t cuda_err = cudaMalloc((void**)ptr, byte_size);
if (cuda_err != cudaSuccess) {
return cb::Error(
"unable to allocate memory of " + std::to_string(byte_size) +
" bytes on gpu for output: " +
std::string(cudaGetErrorString(cuda_err)),
pa::GENERIC_ERROR);
}
if (factory_->Kind() ==
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
RETURN_IF_ERROR(
backend_->RegisterCudaMemory(shm_region_name, *ptr, byte_size));
// Set cudaFree as the destructor
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr),
[shm_region_name, byte_size](uint8_t* memory) {
cudaError_t cuda_err = cudaFree(memory);
if (cuda_err != cudaSuccess) {
std::cerr
<< "Unable to free cuda shared memory for "
<< shm_region_name
<< ": Starting: " << static_cast<void*>(memory)
<< ", size: " << byte_size
<< " bytes, Details: " << cudaGetErrorString(cuda_err)
<< std::endl;
}
}))));
} else {
cudaIpcMemHandle_t cuda_handle;
RETURN_IF_ERROR(
CreateCUDAIPCHandle(&cuda_handle, reinterpret_cast<void*>(*ptr)));
RETURN_IF_ERROR(backend_->RegisterCudaSharedMemory(
shm_region_name, cuda_handle, byte_size));
// No operation required for deleting the memory
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr), [](uint8_t* memory) {}))));
}
#endif // TRITON_ENABLE_GPU
} else {
return cb::Error(
"CreateMemoryRegion called with invalid memory region type.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CopySharedMemory(
uint8_t* input_shm_ptr, const std::vector<TensorData>& tensor_datas,
bool is_shape_tensor, std::string& region_name)
{
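// Batch elements are packed back to back: element i is written at the
// running byte offset of all previous batch-1 sizes. Shape tensors copy only
// the first element, since every batch element must be identical.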
if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
// Populate the region with data
size_t count = 0;
size_t offset = 0;
size_t max_count = is_shape_tensor ? 1 : batch_size_;
while (count < max_count) {
memcpy(
input_shm_ptr + offset, tensor_datas[count].data_ptr,
tensor_datas[count].batch1_size);
offset += tensor_datas[count].batch1_size;
count++;
}
} else {
#ifdef TRITON_ENABLE_GPU
// Populate the region with data
size_t count = 0;
size_t offset = 0;
size_t max_count = is_shape_tensor ? 1 : batch_size_;
while (count < max_count) {
cudaError_t cuda_err = cudaMemcpy(
(void*)(input_shm_ptr + offset), (void*)tensor_datas[count].data_ptr,
tensor_datas[count].batch1_size, cudaMemcpyHostToDevice);
if (cuda_err != cudaSuccess) {
return cb::Error(
"Failed to copy data to cuda shared memory for " + region_name +
" : " + std::string(cudaGetErrorString(cuda_err)),
pa::GENERIC_ERROR);
}
offset += tensor_datas[count].batch1_size;
count++;
}
#endif // TRITON_ENABLE_GPU
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data)
{
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape));
if (!shape.empty()) {
if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
} else {
return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR);
}
cb::InferInput* infer_input;
RETURN_IF_ERROR(CreateInferInput(
&infer_input, backend_kind_, name, shape, model_tensor.datatype_));
infer_data.inputs_.push_back(infer_input);
// FIXME: TMA-765 - Shared memory mode does not currently support optional
// inputs; support will be implemented in the associated story.
infer_data.valid_inputs_.push_back(infer_input);
std::string region_name(
TensorToRegionName(name) + "_" + std::to_string(0) + "_" +
std::to_string(0));
RETURN_IF_ERROR(infer_input->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::InitInferDataOutput(
const std::string& name, InferData& infer_data)
{
cb::InferRequestedOutput* requested_output;
RETURN_IF_ERROR(
cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
infer_data.outputs_.push_back(requested_output);
std::string region_name(TensorToRegionName(name));
RETURN_IF_ERROR(requested_output->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data)
{
for (const auto& input : infer_data.inputs_) {
RETURN_IF_ERROR(input->Reset());
const auto& model_input = (*(parser_->Inputs()))[input->Name()];
std::string region_name(
TensorToRegionName(input->Name()) + '_' + std::to_string(stream_index) +
"_" + std::to_string(step_index));
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(
model_input, stream_index, step_index, &shape));
if (!shape.empty()) {
if ((parser_->MaxBatchSize() != 0) && (!model_input.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
input->SetShape(shape);
}
RETURN_IF_ERROR(input->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif  // TRITON_ENABLE_GPU
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
namespace {
#ifdef TRITON_ENABLE_GPU
#define RETURN_IF_CUDA_ERR(FUNC) \
{ \
const cudaError_t result = FUNC; \
if (result != cudaSuccess) { \
return cb::Error( \
"CUDA exception (line " + std::to_string(__LINE__) + \
"): " + cudaGetErrorName(result) + " (" + \
cudaGetErrorString(result) + ")", \
pa::GENERIC_ERROR); \
} \
}
cb::Error
CreateCUDAIPCHandle(
cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0)
{
// Set the GPU device to the desired GPU
RETURN_IF_CUDA_ERR(cudaSetDevice(device_id));
// Create IPC handle for data on the gpu
RETURN_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr));
return cb::Error::Success;
}
#endif // TRITON_ENABLE_GPU
} // namespace
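// Illustrative sketch (not part of the original sources): RETURN_IF_CUDA_ERR
// wraps a CUDA runtime call and converts any failure into a cb::Error that
// records the line number, error name, and error string. The helper below is
// hypothetical and exists only to show the macro in use.
#ifdef TRITON_ENABLE_GPU
inline cb::Error
ExampleAllocateZeroedDeviceBuffer(void** ptr, size_t byte_size)
{
  // Each call returns early with a descriptive cb::Error on failure.
  RETURN_IF_CUDA_ERR(cudaMalloc(ptr, byte_size));
  RETURN_IF_CUDA_ERR(cudaMemset(*ptr, 0, byte_size));
  return cb::Error::Success;
}
#endif  // TRITON_ENABLE_GPU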
/// Holds information about the shared memory locations
struct SharedMemoryData {
SharedMemoryData(
size_t byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data)
: byte_size_(byte_size), data_(std::move(data))
{
}
SharedMemoryData() {}
// Byte size
size_t byte_size_;
// Unique pointer holding the shared memory data
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data_;
};
/// Manages the infer data needed to prepare an inference request and to hold
/// the resulting inference output from the Triton server, passing input and
/// output data through shared memory regions
class InferDataManagerShm : public InferDataManagerBase {
public:
InferDataManagerShm(
const int32_t batch_size, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: InferDataManagerBase(batch_size, parser, factory, data_loader),
shared_memory_type_(shared_memory_type),
output_shm_size_(output_shm_size)
{
}
~InferDataManagerShm();
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
cb::Error Init() override;
protected:
cb::Error CreateOutputMemoryRegions();
cb::Error CreateAndPopulateInputMemoryRegions();
cb::Error CreateAndPopulateInputMemoryRegion(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id);
/// Create a memory region.
/// \return cb::Error object indicating success or failure.
cb::Error CreateMemoryRegion(
const std::string& shm_region_name, const SharedMemoryType& memory_type,
const size_t byte_size, void** ptr);
/// \brief Helper function to copy the input data into the correct shared
/// memory region
/// \param input_shm_ptr Pointer to the shared memory for a specific input
/// \param input_datas The TensorDatas to be copied
/// \param is_shape_tensor Is the input a shape tensor
/// \param region_name Name of the shared memory region
/// \return cb::Error object indicating success or failure
virtual cb::Error CopySharedMemory(
uint8_t* input_shm_ptr, const std::vector<TensorData>& input_datas,
bool is_shape_tensor, std::string& region_name);
cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) override;
cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) override;
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInputs(
size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data) override;
SharedMemoryType shared_memory_type_;
size_t output_shm_size_;
// Map from shared memory key to its starting address and size
std::unordered_map<std::string, SharedMemoryData> shared_memory_regions_;
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "inference_profiler.h"
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <limits>
#include <queue>
#include <sstream>
#include <stdexcept>
#include "client_backend/client_backend.h"
#include "constants.h"
#include "doctest.h"
namespace triton { namespace perfanalyzer {
cb::Error
ReportPrometheusMetrics(const Metrics& metrics)
{
const size_t max_num_gpus_in_stdout{16};
if (metrics.gpu_utilization_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_power_usage_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_memory_used_bytes_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_memory_total_bytes_per_gpu.size() > max_num_gpus_in_stdout) {
std::cout << "Too many GPUs on system to print out individual Prometheus "
"metrics, use the CSV output feature to see metrics."
<< std::endl;
return cb::Error::Success;
}
std::cout << " Avg GPU Utilization:" << std::endl;
for (const auto& gpu_uuid_metric_pair : metrics.gpu_utilization_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << (metric * 100.0) << "%"
<< std::endl;
}
std::cout << " Avg GPU Power Usage:" << std::endl;
for (const auto& gpu_uuid_metric_pair : metrics.gpu_power_usage_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " watts"
<< std::endl;
}
std::cout << " Max GPU Memory Usage:" << std::endl;
for (const auto& gpu_uuid_metric_pair :
metrics.gpu_memory_used_bytes_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " bytes"
<< std::endl;
}
std::cout << " Total GPU Memory:" << std::endl;
for (const auto& gpu_uuid_metric_pair :
metrics.gpu_memory_total_bytes_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " bytes"
<< std::endl;
}
return cb::Error::Success;
}
namespace {
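// Converts a summed duration in nanoseconds into an average in microseconds.
// Example: 3,000,000 ns over 3 requests -> 3,000,000 / (3 * 1000) = 1000
// usec. Integer division truncates any sub-microsecond remainder.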
inline uint64_t
AverageDurationInUs(const uint64_t total_time_in_ns, const uint64_t cnt)
{
if (cnt == 0) {
return 0;
}
return total_time_in_ns / (cnt * 1000);
}
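// Recursively walks the composing models of an ensemble and accumulates
// per-model average queue, compute, and cache durations. Leaf models (those
// with no composing models of their own) contribute directly; nested
// ensembles are folded in through recursion.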
EnsembleDurations
GetTotalEnsembleDurations(const ServerSideStats& stats)
{
EnsembleDurations result;
for (const auto& model_stats : stats.composing_models_stat) {
if (model_stats.second.composing_models_stat.empty()) {
// Cache hit count covers cache hits, not related to compute times
const uint64_t cache_hit_cnt = model_stats.second.cache_hit_count;
// cache_miss_cnt should either equal infer_cnt or be zero if
// cache is disabled or not supported for the model/scheduler type
const uint64_t cache_miss_cnt = model_stats.second.cache_miss_count;
result.total_queue_time_avg_us += AverageDurationInUs(
model_stats.second.queue_time_ns, model_stats.second.queue_count);
const uint64_t compute_time = model_stats.second.compute_input_time_ns +
model_stats.second.compute_infer_time_ns +
model_stats.second.compute_output_time_ns;
if (model_stats.second.compute_input_count !=
model_stats.second.compute_infer_count ||
model_stats.second.compute_infer_count !=
model_stats.second.compute_output_count) {
throw std::runtime_error(
"Server side statistics compute counts must be the same.");
}
const uint64_t compute_cnt = model_stats.second.compute_input_count;
result.total_compute_time_avg_us +=
AverageDurationInUs(compute_time, compute_cnt);
result.total_cache_hit_time_avg_us += AverageDurationInUs(
model_stats.second.cache_hit_time_ns, cache_hit_cnt);
result.total_cache_miss_time_avg_us += AverageDurationInUs(
model_stats.second.cache_miss_time_ns, cache_miss_cnt);
// Track combined cache/compute total avg for reporting latency with cache
// enabled
result.total_combined_cache_compute_time_avg_us += AverageDurationInUs(
compute_time + model_stats.second.cache_hit_time_ns +
model_stats.second.cache_miss_time_ns,
compute_cnt + cache_hit_cnt);
} else {
const auto this_ensemble_duration =
GetTotalEnsembleDurations(model_stats.second);
result.total_queue_time_avg_us +=
this_ensemble_duration.total_queue_time_avg_us;
result.total_compute_time_avg_us +=
this_ensemble_duration.total_compute_time_avg_us;
result.total_cache_hit_time_avg_us +=
this_ensemble_duration.total_cache_hit_time_avg_us;
result.total_cache_miss_time_avg_us +=
this_ensemble_duration.total_cache_miss_time_avg_us;
result.total_combined_cache_compute_time_avg_us +=
this_ensemble_duration.total_combined_cache_compute_time_avg_us;
}
}
return result;
}
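// Overhead is the part of the total latency not attributed to queueing or
// compute. Example: total 1000 usec, queue 200 usec, compute 600 usec ->
// overhead 200 usec. Clamped to 0 when queue + compute exceeds the total,
// which can happen because each component is an independently rounded
// average.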
size_t
GetOverheadDuration(size_t total_time, size_t queue_time, size_t compute_time)
{
return (total_time > queue_time + compute_time)
? (total_time - queue_time - compute_time)
: 0;
}
cb::Error
ReportServerSideStats(
const ServerSideStats& stats, const int iteration,
const std::shared_ptr<ModelParser>& parser)
{
const std::string ident = std::string(2 * iteration, ' ');
// Infer/exec counts cover compute time done in inference backends,
// not related to cache hit times
const uint64_t exec_cnt = stats.execution_count;
const uint64_t infer_cnt = stats.inference_count;
// Cache hit count covers cache hits, not related to compute times
const uint64_t cache_hit_cnt = stats.cache_hit_count;
const uint64_t cache_miss_cnt = stats.cache_miss_count;
// Success count covers all successful requests; the cumulative time, queue
// time, compute time, and cache times are aggregated over these requests.
const uint64_t cnt = stats.success_count;
if (cnt == 0) {
std::cout << ident << " Request count: " << cnt << std::endl;
return cb::Error::Success;
}
const uint64_t cumm_avg_us = AverageDurationInUs(stats.cumm_time_ns, cnt);
std::cout << ident << " Inference count: " << infer_cnt << std::endl
<< ident << " Execution count: " << exec_cnt << std::endl;
if (parser->ResponseCacheEnabled()) {
std::cout << ident << " Cache hit count: " << cache_hit_cnt << std::endl;
std::cout << ident << " Cache miss count: " << cache_miss_cnt << std::endl;
}
std::cout << ident << " Successful request count: " << cnt << std::endl
<< ident << " Avg request latency: " << cumm_avg_us << " usec";
// Non-ensemble model
if (stats.composing_models_stat.empty()) {
const uint64_t queue_avg_us =
AverageDurationInUs(stats.queue_time_ns, stats.queue_count);
const uint64_t compute_input_avg_us = AverageDurationInUs(
stats.compute_input_time_ns, stats.compute_input_count);
const uint64_t compute_infer_avg_us = AverageDurationInUs(
stats.compute_infer_time_ns, stats.compute_infer_count);
const uint64_t compute_output_avg_us = AverageDurationInUs(
stats.compute_output_time_ns, stats.compute_output_count);
const uint64_t compute_time = stats.compute_input_time_ns +
stats.compute_infer_time_ns +
stats.compute_output_time_ns;
if (stats.compute_input_count != stats.compute_infer_count ||
stats.compute_infer_count != stats.compute_output_count) {
throw std::runtime_error(
"Server side statistics compute counts must be the same.");
}
const uint64_t compute_cnt = stats.compute_input_count;
const uint64_t compute_avg_us =
AverageDurationInUs(compute_time, compute_cnt);
const uint64_t cache_hit_avg_us =
AverageDurationInUs(stats.cache_hit_time_ns, cache_hit_cnt);
const uint64_t cache_miss_avg_us =
AverageDurationInUs(stats.cache_miss_time_ns, cache_miss_cnt);
const uint64_t total_compute_time_ns = stats.compute_input_time_ns +
stats.compute_infer_time_ns +
stats.compute_output_time_ns;
// Get the average of cache hits and misses across successful requests
const uint64_t combined_cache_compute_avg_us = AverageDurationInUs(
stats.cache_hit_time_ns + stats.cache_miss_time_ns +
total_compute_time_ns,
compute_cnt + cache_hit_cnt);
if (parser->ResponseCacheEnabled()) {
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << queue_avg_us << " usec + "
<< "cache hit/miss " << combined_cache_compute_avg_us
<< " usec)" << std::endl;
std::cout << ident << ident
<< " Average Cache Hit Latency: " << cache_hit_avg_us
<< " usec" << std::endl;
std::cout << ident << ident << " Average Cache Miss Latency: "
<< cache_miss_avg_us + compute_avg_us << " usec "
<< "(cache lookup/insertion " << cache_miss_avg_us << " usec + "
<< "compute input " << compute_input_avg_us << " usec + "
<< "compute infer " << compute_infer_avg_us << " usec + "
<< "compute output " << compute_output_avg_us << " usec)"
<< std::endl
<< std::endl;
}
// Response Cache Disabled
else {
std::cout << " (overhead "
<< GetOverheadDuration(
cumm_avg_us, queue_avg_us, compute_avg_us)
<< " usec + "
<< "queue " << queue_avg_us << " usec + "
<< "compute input " << compute_input_avg_us << " usec + "
<< "compute infer " << compute_infer_avg_us << " usec + "
<< "compute output " << compute_output_avg_us << " usec)"
<< std::endl
<< std::endl;
if (cache_hit_avg_us > 0 || cache_miss_avg_us > 0) {
std::cerr << "Response Cache is disabled for model ["
<< parser->ModelName()
<< "] but cache hit/miss latency is non-zero." << std::endl;
}
}
}
// Ensemble Model
else {
const auto ensemble_times = GetTotalEnsembleDurations(stats);
// Response Cache Enabled
if (parser->ResponseCacheEnabled()) {
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, ensemble_times.total_queue_time_avg_us,
ensemble_times.total_combined_cache_compute_time_avg_us);
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "cache hit/miss "
<< ensemble_times.total_combined_cache_compute_time_avg_us
<< " usec)" << std::endl;
std::cout << ident << ident << " Average Cache Hit Latency: "
<< ensemble_times.total_cache_hit_time_avg_us << " usec"
<< std::endl;
std::cout << ident << ident << " Average Cache Miss Latency: "
<< ensemble_times.total_cache_miss_time_avg_us +
ensemble_times.total_compute_time_avg_us
<< " usec " << std::endl
<< std::endl;
}
// Response Cache Disabled
else {
std::cout << " (overhead "
<< GetOverheadDuration(
cumm_avg_us, ensemble_times.total_queue_time_avg_us,
ensemble_times.total_compute_time_avg_us)
<< " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "compute " << ensemble_times.total_compute_time_avg_us
<< " usec)" << std::endl
<< std::endl;
}
// List out composing models of ensemble model
std::cout << ident << "Composing models: " << std::endl;
for (const auto& model_stats : stats.composing_models_stat) {
const auto& model_identifier = model_stats.first;
std::cout << ident << model_identifier.first
<< ", version: " << model_identifier.second << std::endl;
ReportServerSideStats(model_stats.second, iteration + 1, parser);
}
}
return cb::Error::Success;
}
cb::Error
ReportClientSideStats(
const ClientSideStats& stats, const int64_t percentile,
const cb::ProtocolType protocol, const bool verbose,
const bool on_sequence_model, const bool include_lib_stats,
const double overhead_pct, const double send_request_rate,
const bool is_decoupled_model)
{
const uint64_t avg_latency_us = stats.avg_latency_ns / 1000;
const uint64_t std_us = stats.std_us;
const uint64_t avg_request_time_us = stats.avg_request_time_ns / 1000;
const uint64_t avg_send_time_us = stats.avg_send_time_ns / 1000;
const uint64_t avg_receive_time_us = stats.avg_receive_time_ns / 1000;
const uint64_t avg_response_wait_time_us =
avg_request_time_us - avg_send_time_us - avg_receive_time_us;
std::string client_library_detail = " ";
if (include_lib_stats) {
if (protocol == cb::ProtocolType::GRPC) {
client_library_detail +=
"Avg gRPC time: " + std::to_string(avg_request_time_us) + " usec (";
if (!verbose) {
client_library_detail +=
"(un)marshal request/response " +
std::to_string(avg_send_time_us + avg_receive_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) + " usec)";
} else {
client_library_detail += "marshal " + std::to_string(avg_send_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) +
" usec + unmarshal " +
std::to_string(avg_receive_time_us) + " usec)";
}
} else if (protocol == cb::ProtocolType::HTTP) {
client_library_detail +=
"Avg HTTP time: " + std::to_string(avg_request_time_us) + " usec (";
if (!verbose) {
client_library_detail +=
"send/recv " +
std::to_string(avg_send_time_us + avg_receive_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) + " usec)";
} else {
client_library_detail += "send " + std::to_string(avg_send_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) +
" usec + receive " +
std::to_string(avg_receive_time_us) + " usec)";
}
}
}
std::cout << " Request count: " << stats.request_count << std::endl;
double delay_pct =
((double)stats.delayed_request_count / stats.request_count) * 100;
if (delay_pct > DELAY_PCT_THRESHOLD) {
std::cout << " "
<< "Avg send request rate: " << std::fixed << std::setprecision(2)
<< send_request_rate << " infer/sec" << std::endl;
std::cout << " "
<< "[WARNING] Perf Analyzer was not able to keep up with the "
"desired request rate. ";
std::cout << delay_pct << "% of the requests were delayed. " << std::endl;
}
if (on_sequence_model) {
std::cout << " Sequence count: " << stats.sequence_count << " ("
<< stats.sequence_per_sec << " seq/sec)" << std::endl;
}
std::cout << " Throughput: " << stats.infer_per_sec << " infer/sec"
<< std::endl;
if (is_decoupled_model) {
std::cout << " Response Throughput: " << stats.responses_per_sec
<< " infer/sec" << std::endl;
}
if (verbose) {
std::stringstream client_overhead{""};
client_overhead << " "
<< "Avg client overhead: " << std::fixed
<< std::setprecision(2) << overhead_pct << "%";
std::cout << client_overhead.str() << std::endl;
}
if (percentile == -1) {
std::cout << " Avg latency: " << avg_latency_us << " usec"
<< " (standard deviation " << std_us << " usec)" << std::endl;
}
for (const auto& percentile : stats.percentile_latency_ns) {
std::cout << " p" << percentile.first
<< " latency: " << (percentile.second / 1000) << " usec"
<< std::endl;
}
std::cout << client_library_detail << std::endl;
return cb::Error::Success;
}
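// Prints the combined report for a measurement: client-side statistics,
// optionally server-side statistics and Prometheus metrics, and a warning if
// the Perf Analyzer overhead exceeds the configured threshold.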
cb::Error
Report(
const PerfStatus& summary, const int64_t percentile,
const cb::ProtocolType protocol, const bool verbose,
const bool include_lib_stats, const bool include_server_stats,
const std::shared_ptr<ModelParser>& parser,
const bool should_collect_metrics, const double overhead_pct_threshold)
{
std::cout << " Client: " << std::endl;
ReportClientSideStats(
summary.client_stats, percentile, protocol, verbose,
summary.on_sequence_model, include_lib_stats, summary.overhead_pct,
summary.send_request_rate, parser->IsDecoupled());
if (include_server_stats) {
std::cout << " Server: " << std::endl;
ReportServerSideStats(summary.server_stats, 1, parser);
}
if (should_collect_metrics) {
std::cout << " Server Prometheus Metrics: " << std::endl;
ReportPrometheusMetrics(summary.metrics.front());
}
if (summary.overhead_pct > overhead_pct_threshold) {
std::cout << "[WARNING] Perf Analyzer is not able to keep up with the "
"desired load. The results may not be accurate."
<< std::endl;
}
return cb::Error::Success;
}
} // namespace
cb::Error
InferenceProfiler::Create(
const bool verbose, const double stability_threshold,
const uint64_t measurement_window_ms, const size_t max_trials,
const int64_t percentile, const uint64_t latency_threshold_ms_,
const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager,
std::unique_ptr<InferenceProfiler>* profiler,
uint64_t measurement_request_count, MeasurementMode measurement_mode,
std::shared_ptr<MPIDriver> mpi_driver, const uint64_t metrics_interval_ms,
const bool should_collect_metrics, const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data)
{
std::unique_ptr<InferenceProfiler> local_profiler(new InferenceProfiler(
verbose, stability_threshold, measurement_window_ms, max_trials,
(percentile != -1), percentile, latency_threshold_ms_, protocol, parser,
profile_backend, std::move(manager), measurement_request_count,
measurement_mode, mpi_driver, metrics_interval_ms, should_collect_metrics,
overhead_pct_threshold, collector, should_collect_profile_data));
*profiler = std::move(local_profiler);
return cb::Error::Success;
}
InferenceProfiler::InferenceProfiler(
const bool verbose, const double stability_threshold,
const int32_t measurement_window_ms, const size_t max_trials,
const bool extra_percentile, const size_t percentile,
const uint64_t latency_threshold_ms_, const cb::ProtocolType protocol,
std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager, uint64_t measurement_request_count,
MeasurementMode measurement_mode, std::shared_ptr<MPIDriver> mpi_driver,
const uint64_t metrics_interval_ms, const bool should_collect_metrics,
const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data)
: verbose_(verbose), measurement_window_ms_(measurement_window_ms),
max_trials_(max_trials), extra_percentile_(extra_percentile),
percentile_(percentile), latency_threshold_ms_(latency_threshold_ms_),
protocol_(protocol), parser_(parser), profile_backend_(profile_backend),
manager_(std::move(manager)),
measurement_request_count_(measurement_request_count),
measurement_mode_(measurement_mode), mpi_driver_(mpi_driver),
should_collect_metrics_(should_collect_metrics),
overhead_pct_threshold_(overhead_pct_threshold), collector_(collector),
should_collect_profile_data_(should_collect_profile_data)
{
load_parameters_.stability_threshold = stability_threshold;
load_parameters_.stability_window = 3;
if (profile_backend_->Kind() == cb::BackendKind::TRITON ||
profile_backend_->Kind() == cb::BackendKind::TRITON_C_API) {
// Measure and report client library stats only when the model
// is not decoupled.
include_lib_stats_ = (!parser_->IsDecoupled());
// Measure and report server statistics only when the server
// supports the statistics extension.
std::set<std::string> extensions;
profile_backend_->ServerExtensions(&extensions);
include_server_stats_ = (extensions.find("statistics") != extensions.end());
} else {
include_lib_stats_ = true;
include_server_stats_ = false;
}
if (should_collect_metrics_) {
metrics_manager_ =
std::make_shared<MetricsManager>(profile_backend, metrics_interval_ms);
}
}
cb::Error
InferenceProfiler::Profile(
const size_t concurrent_request_count,
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
perf_status.concurrency = concurrent_request_count;
is_stable = false;
meets_threshold = true;
RETURN_IF_ERROR(dynamic_cast<ConcurrencyManager*>(manager_.get())
->ChangeConcurrencyLevel(concurrent_request_count));
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
std::cerr << "Failed to obtain stable measurement within "
<< max_trials_ << " measurement windows for concurrency "
<< concurrent_request_count << ". Please try to "
<< "increase the --measurement-interval." << std::endl;
} else if (measurement_mode_ == MeasurementMode::COUNT_WINDOWS) {
std::cerr << "Failed to obtain stable measurement within "
<< max_trials_ << " measurement windows for concurrency "
<< concurrent_request_count << ". Please try to "
<< "increase the --measurement-request-count." << std::endl;
}
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
const double request_rate, std::vector<PerfStatus>& perf_statuses,
bool& meets_threshold, bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
perf_status.request_rate = request_rate;
is_stable = false;
meets_threshold = true;
RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
->ChangeRequestRate(request_rate));
std::cout << "Request Rate: " << request_rate
<< " inference requests per seconds" << std::endl;
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
std::cerr << "Failed to obtain stable measurement." << std::endl;
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
RETURN_IF_ERROR(
dynamic_cast<CustomLoadManager*>(manager_.get())->InitCustomIntervals());
RETURN_IF_ERROR(dynamic_cast<CustomLoadManager*>(manager_.get())
->GetCustomRequestRate(&perf_status.request_rate));
is_stable = false;
meets_threshold = true;
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
std::cerr << "Failed to obtain stable measurement." << std::endl;
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
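// Repeatedly runs measurement windows until the most recent
// 'stability_window' measurements are stable, the latency threshold is
// exceeded, or max_trials is reached. Stable windows are then merged into a
// single experiment-level report.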
cb::Error
InferenceProfiler::ProfileHelper(
PerfStatus& experiment_perf_status, bool* is_stable)
{
// Start measurement
LoadStatus load_status;
size_t completed_trials = 0;
std::queue<cb::Error> error;
std::deque<PerfStatus> measurement_perf_statuses;
all_request_records_.clear();
previous_window_end_ns_ = 0;
// Start with a fresh empty request records vector in the manager
//
std::vector<RequestRecord> empty_request_records;
RETURN_IF_ERROR(manager_->SwapRequestRecords(empty_request_records));
do {
PerfStatus measurement_perf_status;
measurement_perf_status.concurrency = experiment_perf_status.concurrency;
measurement_perf_status.request_rate = experiment_perf_status.request_rate;
RETURN_IF_ERROR(manager_->CheckHealth());
if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
error.push(
Measure(measurement_perf_status, measurement_window_ms_, false));
} else {
error.push(
Measure(measurement_perf_status, measurement_request_count_, true));
}
measurement_perf_statuses.push_back(measurement_perf_status);
if (error.size() > load_parameters_.stability_window) {
error.pop();
measurement_perf_statuses.pop_front();
}
if (error.back().IsOk()) {
load_status.infer_per_sec.push_back(
measurement_perf_status.client_stats.infer_per_sec);
load_status.latencies.push_back(
measurement_perf_status.stabilizing_latency_ns);
} else {
load_status.infer_per_sec.push_back(0);
load_status.latencies.push_back(std::numeric_limits<uint64_t>::max());
}
load_status.avg_ips +=
load_status.infer_per_sec.back() / load_parameters_.stability_window;
load_status.avg_latency +=
load_status.latencies.back() / load_parameters_.stability_window;
if (verbose_) {
if (error.back().IsOk()) {
std::cout << " Pass [" << (completed_trials + 1)
<< "] throughput: " << load_status.infer_per_sec.back()
<< " infer/sec. ";
if (extra_percentile_) {
std::cout << "p" << percentile_ << " latency: "
<< (measurement_perf_status.client_stats
.percentile_latency_ns.find(percentile_)
->second /
1000)
<< " usec" << std::endl;
} else {
std::cout << "Avg latency: "
<< (measurement_perf_status.client_stats.avg_latency_ns /
1000)
<< " usec (std "
<< measurement_perf_status.client_stats.std_us << " usec). "
<< std::endl;
}
} else {
std::cout << " Pass [" << (completed_trials + 1)
<< "] cb::Error: " << error.back().Message() << std::endl;
}
}
*is_stable = DetermineStability(load_status);
if (IsDoneProfiling(load_status, is_stable)) {
break;
}
completed_trials++;
} while ((!early_exit) && (completed_trials < max_trials_));
if (should_collect_metrics_) {
metrics_manager_->StopQueryingMetrics();
}
// Return the first error, if any, that occurred within the stability window
// so that it can be handled properly.
while (!error.empty()) {
if (!error.front().IsOk()) {
return error.front();
} else {
error.pop();
}
}
// Only merge the results if the results have stabilized.
if (*is_stable) {
RETURN_IF_ERROR(MergePerfStatusReports(
measurement_perf_statuses, experiment_perf_status));
}
if (early_exit) {
return cb::Error("Received exit signal.", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
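// A load is considered stable once the last 'stability_window' measurements
// all have non-zero throughput and both throughput and latency stay within
// the configured stability threshold.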
bool
InferenceProfiler::DetermineStability(LoadStatus& load_status)
{
bool stable = false;
if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
stable = true;
size_t idx =
load_status.infer_per_sec.size() - load_parameters_.stability_window;
for (size_t i = idx; i < load_status.infer_per_sec.size(); i++) {
if (load_status.infer_per_sec[i] == 0) {
stable = false;
}
}
stable = stable && CheckWindowForStability(idx, load_status);
}
return stable;
}
bool
InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status)
{
return IsInferWindowStable(idx, load_status) &&
IsLatencyWindowStable(idx, load_status);
}
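// Throughput is stable when max/min over the stability window does not exceed
// 1 + stability_threshold. For example, with a threshold of 0.1, a window of
// [100, 105, 108] infer/sec is stable because 108 / 100 = 1.08 <= 1.1.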
bool
InferenceProfiler::IsInferWindowStable(size_t idx, LoadStatus& load_status)
{
auto infer_start = std::begin(load_status.infer_per_sec) + idx;
auto infer_per_sec_measurements = std::minmax_element(
infer_start, infer_start + load_parameters_.stability_window);
auto max_infer_per_sec = *infer_per_sec_measurements.second;
auto min_infer_per_sec = *infer_per_sec_measurements.first;
return max_infer_per_sec / min_infer_per_sec <=
1 + load_parameters_.stability_threshold;
}
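// The same max/min ratio check as above, applied to the stabilizing latencies
// recorded for the stability window.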
bool
InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status)
{
auto latency_start = std::begin(load_status.latencies) + idx;
auto latencies_per_sec_measurements = std::minmax_element(
latency_start, latency_start + load_parameters_.stability_window);
double max_latency = *latencies_per_sec_measurements.second;
double min_latency = *latencies_per_sec_measurements.first;
return max_latency / min_latency <= 1 + load_parameters_.stability_threshold;
}
bool
InferenceProfiler::IsDoneProfiling(LoadStatus& load_status, bool* is_stable)
{
bool done = false;
bool within_threshold = true;
if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
size_t idx =
load_status.infer_per_sec.size() - load_parameters_.stability_window;
for (; idx < load_status.infer_per_sec.size(); idx++) {
within_threshold &= CheckWithinThreshold(idx, load_status);
}
}
if (mpi_driver_->IsMPIRun()) {
if (AllMPIRanksAreStable(*is_stable)) {
done = true;
}
} else if (*is_stable) {
done = true;
}
if ((!within_threshold) && (latency_threshold_ms_ != NO_LIMIT)) {
done = true;
}
return done;
}
bool
InferenceProfiler::CheckWithinThreshold(size_t idx, LoadStatus& load_status)
{
return load_status.latencies[idx] <
(latency_threshold_ms_ * NANOS_PER_MILLIS);
}
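// Merges the per-window server-side statistics into a single summary by
// summing counts and cumulative times, recursing into composing models for
// ensembles.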
cb::Error
InferenceProfiler::MergeServerSideStats(
std::vector<ServerSideStats>& server_side_stats,
ServerSideStats& server_side_summary)
{
auto& server_side_stat = server_side_stats[0];
// Make sure that the perf status reports profiling settings match with each
// other.
for (size_t i = 1; i < server_side_stats.size(); i++) {
if (server_side_stats[i].composing_models_stat.size() !=
server_side_stat.composing_models_stat.size()) {
return cb::Error(
"Inconsistent ensemble setting detected between the trials.",
pa::GENERIC_ERROR);
}
}
// Initialize the server stats for the merged report.
server_side_summary.inference_count = 0;
server_side_summary.execution_count = 0;
server_side_summary.cache_hit_count = 0;
server_side_summary.cache_miss_count = 0;
server_side_summary.success_count = 0;
server_side_summary.queue_count = 0;
server_side_summary.compute_input_count = 0;
server_side_summary.compute_output_count = 0;
server_side_summary.compute_infer_count = 0;
server_side_summary.cumm_time_ns = 0;
server_side_summary.queue_time_ns = 0;
server_side_summary.compute_input_time_ns = 0;
server_side_summary.compute_infer_time_ns = 0;
server_side_summary.compute_output_time_ns = 0;
server_side_summary.cache_hit_time_ns = 0;
server_side_summary.cache_miss_time_ns = 0;
server_side_summary.composing_models_stat.clear();
for (auto& composing_model_stat : server_side_stat.composing_models_stat) {
std::vector<ServerSideStats> composing_model_stats;
for (auto& server_side_stat : server_side_stats) {
composing_model_stats.push_back(
server_side_stat.composing_models_stat[composing_model_stat.first]);
}
ServerSideStats merged_composing_model_stats;
RETURN_IF_ERROR(MergeServerSideStats(
composing_model_stats, merged_composing_model_stats));
server_side_summary.composing_models_stat.insert(
{composing_model_stat.first, merged_composing_model_stats});
}
for (auto& server_side_stat : server_side_stats) {
// Aggregated Server Stats
server_side_summary.inference_count += server_side_stat.inference_count;
server_side_summary.execution_count += server_side_stat.execution_count;
server_side_summary.cache_hit_count += server_side_stat.cache_hit_count;
server_side_summary.cache_miss_count += server_side_stat.cache_miss_count;
server_side_summary.success_count += server_side_stat.success_count;
server_side_summary.queue_count += server_side_stat.queue_count;
server_side_summary.compute_input_count +=
server_side_stat.compute_input_count;
server_side_summary.compute_infer_count +=
server_side_stat.compute_infer_count;
server_side_summary.compute_output_count +=
server_side_stat.compute_output_count;
server_side_summary.cumm_time_ns += server_side_stat.cumm_time_ns;
server_side_summary.queue_time_ns += server_side_stat.queue_time_ns;
server_side_summary.compute_input_time_ns +=
server_side_stat.compute_input_time_ns;
server_side_summary.compute_infer_time_ns +=
server_side_stat.compute_infer_time_ns;
server_side_summary.compute_output_time_ns +=
server_side_stat.compute_output_time_ns;
server_side_summary.cache_hit_time_ns += server_side_stat.cache_hit_time_ns;
server_side_summary.cache_miss_time_ns +=
server_side_stat.cache_miss_time_ns;
}
return cb::Error::Success;
}
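// Merges the per-window PerfStatus reports of a stable run into a single
// experiment-level report: counts and latencies are accumulated, library
// timing averages are weighted by completed request count, and throughput
// rates are recomputed over the combined duration.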
cb::Error
InferenceProfiler::MergePerfStatusReports(
std::deque<PerfStatus>& perf_status_reports,
PerfStatus& experiment_perf_status)
{
auto& perf_status = perf_status_reports[0];
// Make sure that the perf status reports profiling settings match with each
// other.
for (size_t i = 1; i < perf_status_reports.size(); i++) {
perf_status.concurrency = experiment_perf_status.concurrency;
perf_status.request_rate = experiment_perf_status.request_rate;
if (perf_status_reports[i].on_sequence_model !=
perf_status.on_sequence_model) {
return cb::Error(
"Inconsistent sequence setting detected.", pa::GENERIC_ERROR);
}
if (perf_status_reports[i].batch_size != perf_status.batch_size) {
return cb::Error("Inconsistent batch size detected.", pa::GENERIC_ERROR);
}
if (perf_status_reports[i].server_stats.composing_models_stat.size() !=
perf_status.server_stats.composing_models_stat.size()) {
return cb::Error(
"Inconsistent ensemble setting detected between the trials.",
pa::GENERIC_ERROR);
}
}
experiment_perf_status.batch_size = perf_status.batch_size;
experiment_perf_status.on_sequence_model = perf_status.on_sequence_model;
// Initialize the client stats for the merged report.
experiment_perf_status.client_stats.request_count = 0;
experiment_perf_status.client_stats.sequence_count = 0;
experiment_perf_status.client_stats.delayed_request_count = 0;
experiment_perf_status.client_stats.response_count = 0;
experiment_perf_status.client_stats.duration_ns = 0;
experiment_perf_status.client_stats.avg_latency_ns = 0;
experiment_perf_status.client_stats.percentile_latency_ns.clear();
experiment_perf_status.client_stats.latencies.clear();
experiment_perf_status.client_stats.std_us = 0;
experiment_perf_status.client_stats.avg_request_time_ns = 0;
experiment_perf_status.client_stats.avg_send_time_ns = 0;
experiment_perf_status.client_stats.avg_receive_time_ns = 0;
experiment_perf_status.client_stats.infer_per_sec = 0;
experiment_perf_status.client_stats.sequence_per_sec = 0;
experiment_perf_status.client_stats.completed_count = 0;
experiment_perf_status.stabilizing_latency_ns = 0;
experiment_perf_status.overhead_pct = 0;
experiment_perf_status.send_request_rate = 0.0;
std::vector<ServerSideStats> server_side_stats;
for (auto& perf_status : perf_status_reports) {
// Aggregated Client Stats
experiment_perf_status.client_stats.request_count +=
perf_status.client_stats.request_count;
experiment_perf_status.client_stats.sequence_count +=
perf_status.client_stats.sequence_count;
experiment_perf_status.client_stats.delayed_request_count +=
perf_status.client_stats.delayed_request_count;
experiment_perf_status.client_stats.response_count +=
perf_status.client_stats.response_count;
experiment_perf_status.client_stats.duration_ns +=
perf_status.client_stats.duration_ns;
server_side_stats.push_back(perf_status.server_stats);
experiment_perf_status.client_stats.latencies.insert(
experiment_perf_status.client_stats.latencies.end(),
perf_status.client_stats.latencies.begin(),
perf_status.client_stats.latencies.end());
// Accumulate the overhead percentage and send rate here to remove extra
// traversals over the perf_status_reports
experiment_perf_status.overhead_pct += perf_status.overhead_pct;
experiment_perf_status.send_request_rate += perf_status.send_request_rate;
}
// Calculate the average overhead_pct for the experiment.
experiment_perf_status.overhead_pct /= perf_status_reports.size();
experiment_perf_status.send_request_rate /= perf_status_reports.size();
if (include_lib_stats_) {
for (auto& perf_status : perf_status_reports) {
experiment_perf_status.client_stats.completed_count +=
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_request_time_ns +=
perf_status.client_stats.avg_request_time_ns *
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_send_time_ns +=
perf_status.client_stats.avg_send_time_ns *
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_receive_time_ns +=
perf_status.client_stats.avg_receive_time_ns *
perf_status.client_stats.completed_count;
}
if (experiment_perf_status.client_stats.completed_count != 0) {
experiment_perf_status.client_stats.avg_request_time_ns =
experiment_perf_status.client_stats.avg_request_time_ns /
experiment_perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_send_time_ns =
experiment_perf_status.client_stats.avg_send_time_ns /
experiment_perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_receive_time_ns =
experiment_perf_status.client_stats.avg_receive_time_ns /
experiment_perf_status.client_stats.completed_count;
}
}
RETURN_IF_ERROR(MergeServerSideStats(
server_side_stats, experiment_perf_status.server_stats));
std::sort(
experiment_perf_status.client_stats.latencies.begin(),
experiment_perf_status.client_stats.latencies.end());
float client_duration_sec =
(float)experiment_perf_status.client_stats.duration_ns / NANOS_PER_SECOND;
experiment_perf_status.client_stats.sequence_per_sec =
experiment_perf_status.client_stats.sequence_count / client_duration_sec;
experiment_perf_status.client_stats.infer_per_sec =
(experiment_perf_status.client_stats.request_count *
experiment_perf_status.batch_size) /
client_duration_sec;
experiment_perf_status.client_stats.responses_per_sec =
experiment_perf_status.client_stats.response_count / client_duration_sec;
RETURN_IF_ERROR(SummarizeLatency(
experiment_perf_status.client_stats.latencies, experiment_perf_status));
if (should_collect_metrics_) {
// Put all Metric objects in a flat vector so they're easier to merge
std::vector<std::reference_wrapper<const Metrics>> all_metrics{};
std::for_each(
perf_status_reports.begin(), perf_status_reports.end(),
[&all_metrics](const PerfStatus& p) {
std::for_each(
p.metrics.begin(), p.metrics.end(),
[&all_metrics](const Metrics& m) { all_metrics.push_back(m); });
});
Metrics merged_metrics{};
RETURN_IF_ERROR(MergeMetrics(all_metrics, merged_metrics));
experiment_perf_status.metrics.push_back(std::move(merged_metrics));
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::GetServerSideStatus(
std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_stats)
{
if ((parser_->SchedulerType() == ModelParser::ENSEMBLE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE)) {
RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(model_stats));
} else {
RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(
model_stats, parser_->ModelName(), parser_->ModelVersion()));
}
return cb::Error::Success;
}
// Runs a single measurement window (time-based or count-based) and
// summarizes the client- and server-side statistics observed within it.
cb::Error
InferenceProfiler::Measure(
PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based)
{
std::map<cb::ModelIdentifier, cb::ModelStatistics> start_status;
std::map<cb::ModelIdentifier, cb::ModelStatistics> end_status;
cb::InferStat start_stat;
cb::InferStat end_stat;
manager_->ResetIdleTime();
// Set current window start time to end of previous window. For first
// measurement window, capture start time, server side stats, and client side
// stats.
uint64_t window_start_ns = previous_window_end_ns_;
start_stat = prev_client_side_stats_;
start_status = prev_server_side_stats_;
if (window_start_ns == 0) {
window_start_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
if (should_collect_metrics_) {
metrics_manager_->StartQueryingMetrics();
}
if (include_server_stats_) {
RETURN_IF_ERROR(GetServerSideStatus(&start_status));
}
RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&start_stat));
}
if (should_collect_metrics_) {
try {
metrics_manager_->CheckQueryingStatus();
}
catch (const std::exception& e) {
return cb::Error(e.what(), pa::GENERIC_ERROR);
}
}
if (!is_count_based) {
// Wait for specified time interval in msec
std::this_thread::sleep_for(
std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2)));
} else {
do {
// Check the health of the worker threads.
RETURN_IF_ERROR(manager_->CheckHealth());
// Wait for 1s until enough samples have been collected.
std::this_thread::sleep_for(std::chrono::milliseconds((uint64_t)1000));
} while (manager_->CountCollectedRequests() < measurement_window);
}
uint64_t window_end_ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
previous_window_end_ns_ = window_end_ns;
if (should_collect_metrics_) {
metrics_manager_->GetLatestMetrics(perf_status.metrics);
}
// Get server status and then print report on difference between
// before and after status.
if (include_server_stats_) {
RETURN_IF_ERROR(GetServerSideStatus(&end_status));
prev_server_side_stats_ = end_status;
}
RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&end_stat));
prev_client_side_stats_ = end_stat;
std::vector<RequestRecord> current_request_records;
RETURN_IF_ERROR(manager_->SwapRequestRecords(current_request_records));
all_request_records_.insert(
all_request_records_.end(), current_request_records.begin(),
current_request_records.end());
RETURN_IF_ERROR(Summarize(
start_status, end_status, start_stat, end_stat, perf_status,
window_start_ns, window_end_ns));
return cb::Error::Success;
}
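// Summarizes a single measurement window: extracts the latencies of requests
// that completed within the window, then derives client-side statistics,
// Perf Analyzer overhead, send request rate, and (optionally) server-side
// statistics.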
cb::Error
InferenceProfiler::Summarize(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns)
{
size_t valid_sequence_count = 0;
size_t delayed_request_count = 0;
size_t response_count = 0;
// Get measurement from requests that fall within the time interval
std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
uint64_t window_duration_ns = valid_range.second - valid_range.first;
std::vector<uint64_t> latencies;
std::vector<RequestRecord> valid_requests{};
ValidLatencyMeasurement(
valid_range, valid_sequence_count, delayed_request_count, &latencies,
response_count, valid_requests);
if (should_collect_profile_data_) {
CollectData(
summary, window_start_ns, window_end_ns, std::move(valid_requests));
}
RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
RETURN_IF_ERROR(SummarizeClientStat(
start_stat, end_stat, window_duration_ns, latencies.size(),
valid_sequence_count, delayed_request_count, response_count, summary));
summary.client_stats.latencies = std::move(latencies);
SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);
double window_duration_s{
window_duration_ns / static_cast<double>(NANOS_PER_SECOND)};
SummarizeSendRequestRate(
window_duration_s, manager_->GetAndResetNumSentRequests(), summary);
if (include_server_stats_) {
RETURN_IF_ERROR(SummarizeServerStats(
start_status, end_status, &(summary.server_stats)));
}
return cb::Error::Success;
}
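// Collects the latencies of requests whose final response arrived within the
// measurement window, counts their responses, sequences, and delays, and
// removes the consumed records from 'all_request_records_'.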
void
InferenceProfiler::ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* valid_latencies, size_t& response_count,
std::vector<RequestRecord>& valid_requests)
{
valid_latencies->clear();
valid_sequence_count = 0;
response_count = 0;
std::vector<size_t> erase_indices{};
for (size_t i = 0; i < all_request_records_.size(); i++) {
const auto& request_record = all_request_records_[i];
uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_);
uint64_t request_end_ns;
if (request_record.has_null_last_response_ == false) {
request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
} else if (request_record.response_times_.size() > 1) {
size_t last_response_idx{request_record.response_times_.size() - 2};
request_end_ns =
CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
} else {
erase_indices.push_back(i);
continue;
}
if (request_start_ns <= request_end_ns) {
// Only counting requests that end within the time interval
if ((request_end_ns >= valid_range.first) &&
(request_end_ns <= valid_range.second)) {
valid_latencies->push_back(request_end_ns - request_start_ns);
response_count += request_record.response_times_.size();
if (request_record.has_null_last_response_) {
response_count--;
}
erase_indices.push_back(i);
if (request_record.sequence_end_) {
valid_sequence_count++;
}
if (request_record.delayed_) {
delayed_request_count++;
}
}
}
}
std::for_each(
erase_indices.begin(), erase_indices.end(),
[this, &valid_requests](size_t i) {
valid_requests.push_back(std::move(this->all_request_records_[i]));
});
// Iterate through erase indices backwards so that erases from
// `all_request_records_` happen from the back to the front to avoid using
// wrong indices after subsequent erases
std::for_each(erase_indices.rbegin(), erase_indices.rend(), [this](size_t i) {
this->all_request_records_.erase(this->all_request_records_.begin() + i);
});
// Always sort the measured latencies, since percentiles are reported by default
std::sort(valid_latencies->begin(), valid_latencies->end());
}
void
InferenceProfiler::CollectData(
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
std::vector<RequestRecord>&& request_records)
{
InferenceLoadMode id{summary.concurrency, summary.request_rate};
collector_->AddWindow(id, window_start_ns, window_end_ns);
collector_->AddData(id, std::move(request_records));
}
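// Computes the average latency, its standard deviation, and the default
// latency percentiles (p50/p90/p95/p99, plus the user-requested percentile if
// any), and selects the value used as the stability criterion.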
cb::Error
InferenceProfiler::SummarizeLatency(
const std::vector<uint64_t>& latencies, PerfStatus& summary)
{
if (latencies.size() == 0) {
return cb::Error(
"No valid requests recorded within time interval."
" Please use a larger time window.",
pa::OPTION_ERROR);
}
std::tie(summary.client_stats.avg_latency_ns, summary.client_stats.std_us) =
GetMeanAndStdDev(latencies);
// Retrieve the other percentiles of interest
summary.client_stats.percentile_latency_ns.clear();
std::set<size_t> percentiles{50, 90, 95, 99};
if (extra_percentile_) {
percentiles.emplace(percentile_);
}
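// Index of the requested percentile in the sorted latencies, rounded to the
// nearest entry: e.g. p90 of 11 samples uses index round(0.9 * 10) = 9.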
for (const auto percentile : percentiles) {
size_t index = (percentile / 100.0) * (latencies.size() - 1) + 0.5;
summary.client_stats.percentile_latency_ns.emplace(
percentile, latencies[index]);
}
if (extra_percentile_) {
summary.stabilizing_latency_ns =
summary.client_stats.percentile_latency_ns.find(percentile_)->second;
} else {
summary.stabilizing_latency_ns = summary.client_stats.avg_latency_ns;
}
return cb::Error::Success;
}
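// Returns the mean latency in nanoseconds and the sample standard deviation
// (Bessel-corrected, i.e. dividing by n - 1) converted to microseconds.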
std::tuple<uint64_t, uint64_t>
InferenceProfiler::GetMeanAndStdDev(const std::vector<uint64_t>& latencies)
{
uint64_t avg_latency_ns{0};
uint64_t std_dev_latency_us{0};
// calculate mean of latencies
uint64_t tol_latency_ns{
std::accumulate(latencies.begin(), latencies.end(), 0ULL)};
avg_latency_ns = tol_latency_ns / latencies.size();
// calculate sample standard deviation of latencies
uint64_t sq_sum_latency_avg_diff_ns{0};
std::for_each(
latencies.begin(), latencies.end(),
[avg_latency_ns, &sq_sum_latency_avg_diff_ns](uint64_t l) {
sq_sum_latency_avg_diff_ns += static_cast<int64_t>(l - avg_latency_ns) *
static_cast<int64_t>(l - avg_latency_ns);
});
if (latencies.size() > 1) {
std_dev_latency_us =
std::sqrt(sq_sum_latency_avg_diff_ns / (latencies.size() - 1)) / 1000;
} else {
std_dev_latency_us = UINT64_MAX;
std::cerr << "WARNING: Pass contained only one request, so sample latency "
"standard deviation will be infinity (UINT64_MAX)."
<< std::endl;
}
return std::make_tuple(avg_latency_ns, std_dev_latency_us);
}
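// Fills in the client-side statistics for the window: throughput and
// per-second rates from the counted requests/responses/sequences, plus
// average request/send/receive times when library stats are included.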
cb::Error
InferenceProfiler::SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t valid_sequence_count, const size_t delayed_request_count,
const size_t response_count, PerfStatus& summary)
{
summary.on_sequence_model =
((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));
summary.batch_size = std::max(manager_->BatchSize(), (size_t)1);
summary.client_stats.request_count = valid_request_count;
summary.client_stats.sequence_count = valid_sequence_count;
summary.client_stats.delayed_request_count = delayed_request_count;
summary.client_stats.response_count = response_count;
summary.client_stats.duration_ns = duration_ns;
float client_duration_sec =
(float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
summary.client_stats.sequence_per_sec =
valid_sequence_count / client_duration_sec;
summary.client_stats.infer_per_sec =
(valid_request_count * summary.batch_size) / client_duration_sec;
summary.client_stats.responses_per_sec = response_count / client_duration_sec;
if (include_lib_stats_) {
size_t completed_count =
end_stat.completed_request_count - start_stat.completed_request_count;
uint64_t request_time_ns = end_stat.cumulative_total_request_time_ns -
start_stat.cumulative_total_request_time_ns;
summary.client_stats.completed_count = completed_count;
uint64_t send_time_ns =
end_stat.cumulative_send_time_ns - start_stat.cumulative_send_time_ns;
uint64_t receive_time_ns = end_stat.cumulative_receive_time_ns -
start_stat.cumulative_receive_time_ns;
if (completed_count != 0) {
summary.client_stats.avg_request_time_ns =
request_time_ns / completed_count;
summary.client_stats.avg_send_time_ns = send_time_ns / completed_count;
summary.client_stats.avg_receive_time_ns =
receive_time_ns / completed_count;
}
}
return cb::Error::Success;
}
void
InferenceProfiler::SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
if (window_duration_s <= 0.0) {
throw std::runtime_error("window_duration_s must be positive");
}
summary.send_request_rate = num_sent_requests / window_duration_s;
}
cb::Error
InferenceProfiler::DetermineStatsModelVersion(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
int64_t* status_model_version)
{
// If model_version is unspecified then look in the stats to find the
// version with stats that incremented during the measurement.
//
// If multiple versions had incremented stats, use the highest numbered one
// and print a warning
*status_model_version = -1;
bool multiple_found = false;
bool version_unspecified = model_identifier.second.empty();
if (version_unspecified) {
for (const auto& x : end_stats) {
const auto& end_id = x.first;
const auto& end_stat = x.second;
bool is_correct_model_name =
model_identifier.first.compare(end_id.first) == 0;
if (is_correct_model_name) {
uint64_t end_queue_count = end_stat.queue_count_;
uint64_t start_queue_count = 0;
const auto& itr = start_stats.find(end_id);
if (itr != start_stats.end()) {
start_queue_count = itr->second.queue_count_;
}
if (end_queue_count > start_queue_count) {
int64_t this_version = std::stoll(end_id.second);
if (*status_model_version != -1) {
multiple_found = true;
}
*status_model_version = std::max(*status_model_version, this_version);
}
}
}
} else {
const auto& itr = end_stats.find(model_identifier);
if (itr != end_stats.end()) {
*status_model_version = std::stoll(model_identifier.second);
}
}
if (*status_model_version == -1) {
return cb::Error(
"failed to find the requested model version", pa::GENERIC_ERROR);
}
if (multiple_found) {
std::cerr << "WARNING: Multiple versions of model "
<< model_identifier.first
<< " are loaded in the triton server, and the version to use was "
"unspecified. The stats for that model may be inaccurate."
<< std::endl;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
RETURN_IF_ERROR(SummarizeServerStats(
std::make_pair(parser_->ModelName(), parser_->ModelVersion()),
start_status, end_status, server_stats));
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
RETURN_IF_ERROR(SummarizeServerStatsHelper(
model_identifier, start_status, end_status, server_stats));
// Summarize the composing models, if any.
for (auto composing_model_identifier :
(*parser_->GetComposingModelMap())[model_identifier.first]) {
int64_t model_version;
RETURN_IF_ERROR(DetermineStatsModelVersion(
composing_model_identifier, start_status, end_status, &model_version));
composing_model_identifier.second = std::to_string(model_version);
auto it = server_stats->composing_models_stat
.emplace(composing_model_identifier, ServerSideStats())
.first;
RETURN_IF_ERROR(SummarizeServerStats(
composing_model_identifier, start_status, end_status, &(it->second)));
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
int64_t model_version;
RETURN_IF_ERROR(DetermineStatsModelVersion(
model_identifier, start_status, end_status, &model_version));
const std::pair<std::string, std::string> this_id(
model_identifier.first, std::to_string(model_version));
const auto& end_itr = end_status.find(this_id);
if (end_itr == end_status.end()) {
return cb::Error(
"missing statistics for requested model", pa::GENERIC_ERROR);
} else {
uint64_t start_infer_cnt = 0;
uint64_t start_exec_cnt = 0;
uint64_t start_cnt = 0;
uint64_t start_queue_cnt = 0;
uint64_t start_compute_input_cnt = 0;
uint64_t start_compute_infer_cnt = 0;
uint64_t start_compute_output_cnt = 0;
uint64_t start_cumm_time_ns = 0;
uint64_t start_queue_time_ns = 0;
uint64_t start_compute_input_time_ns = 0;
uint64_t start_compute_infer_time_ns = 0;
uint64_t start_compute_output_time_ns = 0;
uint64_t start_cache_hit_cnt = 0;
uint64_t start_cache_hit_time_ns = 0;
uint64_t start_cache_miss_cnt = 0;
uint64_t start_cache_miss_time_ns = 0;
const auto& start_itr = start_status.find(this_id);
if (start_itr != start_status.end()) {
start_infer_cnt = start_itr->second.inference_count_;
start_exec_cnt = start_itr->second.execution_count_;
start_cnt = start_itr->second.success_count_;
start_queue_cnt = start_itr->second.queue_count_;
start_compute_input_cnt = start_itr->second.compute_input_count_;
start_compute_infer_cnt = start_itr->second.compute_infer_count_;
start_compute_output_cnt = start_itr->second.compute_output_count_;
start_cumm_time_ns = start_itr->second.cumm_time_ns_;
start_queue_time_ns = start_itr->second.queue_time_ns_;
start_compute_input_time_ns = start_itr->second.compute_input_time_ns_;
start_compute_infer_time_ns = start_itr->second.compute_infer_time_ns_;
start_compute_output_time_ns = start_itr->second.compute_output_time_ns_;
start_cache_hit_cnt = start_itr->second.cache_hit_count_;
start_cache_hit_time_ns = start_itr->second.cache_hit_time_ns_;
start_cache_miss_cnt = start_itr->second.cache_miss_count_;
start_cache_miss_time_ns = start_itr->second.cache_miss_time_ns_;
}
server_stats->inference_count =
end_itr->second.inference_count_ - start_infer_cnt;
server_stats->execution_count =
end_itr->second.execution_count_ - start_exec_cnt;
server_stats->success_count = end_itr->second.success_count_ - start_cnt;
server_stats->queue_count = end_itr->second.queue_count_ - start_queue_cnt;
server_stats->compute_input_count =
end_itr->second.compute_input_count_ - start_compute_input_cnt;
server_stats->compute_infer_count =
end_itr->second.compute_infer_count_ - start_compute_infer_cnt;
server_stats->compute_output_count =
end_itr->second.compute_output_count_ - start_compute_output_cnt;
server_stats->cumm_time_ns =
end_itr->second.cumm_time_ns_ - start_cumm_time_ns;
server_stats->queue_time_ns =
end_itr->second.queue_time_ns_ - start_queue_time_ns;
server_stats->compute_input_time_ns =
end_itr->second.compute_input_time_ns_ - start_compute_input_time_ns;
server_stats->compute_infer_time_ns =
end_itr->second.compute_infer_time_ns_ - start_compute_infer_time_ns;
server_stats->compute_output_time_ns =
end_itr->second.compute_output_time_ns_ - start_compute_output_time_ns;
server_stats->cache_hit_count =
end_itr->second.cache_hit_count_ - start_cache_hit_cnt;
server_stats->cache_hit_time_ns =
end_itr->second.cache_hit_time_ns_ - start_cache_hit_time_ns;
server_stats->cache_miss_count =
end_itr->second.cache_miss_count_ - start_cache_miss_cnt;
server_stats->cache_miss_time_ns =
end_itr->second.cache_miss_time_ns_ - start_cache_miss_time_ns;
}
return cb::Error::Success;
}
void
InferenceProfiler::SummarizeOverhead(
const uint64_t window_duration_ns, const uint64_t idle_ns,
PerfStatus& summary)
{
// The window start/stop is not instantaneous. It is possible that the PA
// overhead is smaller than the delay in the window start/stop process. Treat
// it as 0% overhead (100% idle) in that case
//
if (idle_ns > window_duration_ns) {
summary.overhead_pct = 0;
} else {
uint64_t overhead_ns = window_duration_ns - idle_ns;
double overhead_pct = double(overhead_ns) / window_duration_ns * 100;
summary.overhead_pct = overhead_pct;
}
}
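// Broadcasts each rank's stability flag to all MPI ranks and returns true
// only if every rank has reported a stable measurement.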
bool
InferenceProfiler::AllMPIRanksAreStable(bool current_rank_stability)
{
int world_size{mpi_driver_->MPICommSizeWorld()};
std::vector<int> stabilities_per_rank{};
stabilities_per_rank.resize(world_size, 0);
int my_rank{mpi_driver_->MPICommRankWorld()};
stabilities_per_rank[my_rank] = static_cast<int>(current_rank_stability);
for (int rank{0}; rank < world_size; rank++) {
mpi_driver_->MPIBcastIntWorld(stabilities_per_rank.data() + rank, 1, rank);
}
bool all_stable{true};
for (int rank{0}; rank < world_size; rank++) {
if (stabilities_per_rank[rank] == 0) {
all_stable = false;
break;
}
}
if (verbose_ && all_stable) {
std::cout << "All models on all MPI ranks are stable" << std::endl;
}
return all_stable;
}
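// Merges the per-window GPU metrics: utilization and power usage are averaged
// per GPU, memory used takes the per-GPU maximum, and memory total takes the
// first observed value.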
cb::Error
InferenceProfiler::MergeMetrics(
const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
Metrics& merged_metrics)
{
// Maps from each metric collection mapping gpu uuid to gpu utilization
std::vector<std::reference_wrapper<const std::map<std::string, double>>>
gpu_utilization_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu power usage
std::vector<std::reference_wrapper<const std::map<std::string, double>>>
gpu_power_usage_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu memory used bytes
std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
gpu_memory_used_bytes_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu memory total bytes
std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
gpu_memory_total_bytes_per_gpu_maps{};
// Put all metric maps in vector so they're easier to aggregate
std::for_each(
all_metrics.begin(), all_metrics.end(),
[&gpu_utilization_per_gpu_maps, &gpu_power_usage_per_gpu_maps,
&gpu_memory_used_bytes_per_gpu_maps,
&gpu_memory_total_bytes_per_gpu_maps](
const std::reference_wrapper<const Metrics> m) {
gpu_utilization_per_gpu_maps.push_back(m.get().gpu_utilization_per_gpu);
gpu_power_usage_per_gpu_maps.push_back(m.get().gpu_power_usage_per_gpu);
gpu_memory_used_bytes_per_gpu_maps.push_back(
m.get().gpu_memory_used_bytes_per_gpu);
gpu_memory_total_bytes_per_gpu_maps.push_back(
m.get().gpu_memory_total_bytes_per_gpu);
});
GetMetricAveragePerGPU<double>(
gpu_utilization_per_gpu_maps, merged_metrics.gpu_utilization_per_gpu);
GetMetricAveragePerGPU<double>(
gpu_power_usage_per_gpu_maps, merged_metrics.gpu_power_usage_per_gpu);
GetMetricMaxPerGPU<uint64_t>(
gpu_memory_used_bytes_per_gpu_maps,
merged_metrics.gpu_memory_used_bytes_per_gpu);
GetMetricFirstPerGPU<uint64_t>(
gpu_memory_total_bytes_per_gpu_maps,
merged_metrics.gpu_memory_total_bytes_per_gpu);
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <algorithm>
#include <cstdint>
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <thread>
#include <tuple>
#include <vector>
#include "concurrency_manager.h"
#include "constants.h"
#include "custom_load_manager.h"
#include "metrics.h"
#include "metrics_manager.h"
#include "model_parser.h"
#include "mpi_utils.h"
#include "profile_data_collector.h"
#include "request_rate_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
class TestInferenceProfiler;
#endif
/// Constant parameters that determine whether the stopping criteria have been
/// met for the current phase of testing
struct LoadParams {
// The number of measurements to account for during calculation of load
// status
uint32_t stability_window;
// The +/- range to account for while assessing load status
double stability_threshold;
};
/// Data structure to keep track of real-time load status and determine whether
/// the stopping criteria have been met for the current phase of testing.
struct LoadStatus {
// Stores the observations of infer_per_sec and latencies in a vector
std::vector<double> infer_per_sec;
std::vector<uint64_t> latencies;
// Records the average inference per second within the stability window
double avg_ips = 0;
// Stores the average latency within the stability window
uint64_t avg_latency = 0;
};
// Holds the totals of the timing components of the composing models of an
// ensemble.
struct EnsembleDurations {
EnsembleDurations()
: total_queue_time_avg_us(0), total_compute_time_avg_us(0),
total_cache_hit_time_avg_us(0), total_cache_miss_time_avg_us(0),
total_combined_cache_compute_time_avg_us(0)
{
}
uint64_t total_queue_time_avg_us;
uint64_t total_compute_time_avg_us;
// Time spent on cache lookups/copies for cache hits
uint64_t total_cache_hit_time_avg_us;
// Time spent on cache lookups/copies/insertions for cache misses
uint64_t total_cache_miss_time_avg_us;
// Combined average of cache and compute times
uint64_t total_combined_cache_compute_time_avg_us;
};
/// Holds the server-side inference statistics of the target model and its
/// composing models
struct ServerSideStats {
uint64_t inference_count;
uint64_t execution_count;
uint64_t cache_hit_count;
uint64_t cache_miss_count;
uint64_t success_count;
uint64_t queue_count;
uint64_t compute_input_count;
uint64_t compute_infer_count;
uint64_t compute_output_count;
uint64_t cumm_time_ns;
uint64_t queue_time_ns;
uint64_t compute_input_time_ns;
uint64_t compute_infer_time_ns;
uint64_t compute_output_time_ns;
// Time spent on cache lookups/copies for cache hits
uint64_t cache_hit_time_ns;
// Time spent on cache lookups/copies/insertions for cache misses
uint64_t cache_miss_time_ns;
std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
};
/// Holds the statistics recorded at the client side.
struct ClientSideStats {
// Request count and elapsed time measured by client
uint64_t request_count;
// Only record sequences that finish within the measurement window
uint64_t sequence_count;
// The number of requests that missed their schedule
uint64_t delayed_request_count;
// The number of responses
uint64_t response_count;
uint64_t duration_ns;
uint64_t avg_latency_ns;
// An ordered map of percentiles to be reported (<percentile, value> pairs)
std::map<size_t, uint64_t> percentile_latency_ns;
// List of all the valid latencies.
std::vector<uint64_t> latencies;
// Stored in usec to avoid squaring very large nanosecond values
uint64_t std_us;
uint64_t avg_request_time_ns;
uint64_t avg_send_time_ns;
uint64_t avg_receive_time_ns;
// Per sec stat
double infer_per_sec;
double responses_per_sec;
double sequence_per_sec;
// Completed request count reported by the client library
uint64_t completed_count;
};
/// The entire statistics record.
struct PerfStatus {
uint32_t concurrency;
double request_rate;
size_t batch_size;
ServerSideStats server_stats;
ClientSideStats client_stats;
std::vector<Metrics> metrics{};
double overhead_pct;
bool on_sequence_model;
// placeholder for the latency value that is used for conditional checking
uint64_t stabilizing_latency_ns;
// Metric for requests sent per second
double send_request_rate{0.0};
};
cb::Error ReportPrometheusMetrics(const Metrics& metrics);
//==============================================================================
/// An InferenceProfiler is a helper class that measures and summarizes
/// inference statistics under different concurrency levels.
///
/// The profiler can adjust the number of concurrent requests by informing the
/// concurrency manager. After the adjustment, the profiler actively collects
/// statistics from both the concurrency manager and the inference server until
/// the measurements are stable. Once stable, the profiler updates the
/// 'status_summary' based on the most recent measurement.
///
/// The measurement procedure:
/// 1. The profiler gets start status from the server and records the start
/// time.
/// 2. After given time interval, the profiler gets end status from the server
/// and records the end time.
/// 3. The profiler obtains the request records recorded by concurrency manager,
/// and uses the request records that are recorded between start time and end
/// time to measure client side status and update status_summary.
///
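/// Illustrative usage sketch (variable names are placeholders; actual call
/// sites construct these objects elsewhere):
///
///   std::unique_ptr<InferenceProfiler> profiler;
///   RETURN_IF_ERROR(InferenceProfiler::Create(
///       verbose, stability_threshold, measurement_window_ms, max_trials,
///       percentile, latency_threshold_ms, protocol, parser, profile_backend,
///       std::move(manager), &profiler, measurement_request_count,
///       measurement_mode, mpi_driver, metrics_interval_ms,
///       should_collect_metrics, overhead_pct_threshold, collector,
///       should_collect_profile_data));
///   std::vector<PerfStatus> perf_statuses;
///   // Sweep concurrency 1..16 in steps of 2 using linear search.
///   RETURN_IF_ERROR(profiler->Profile<size_t>(
///       1, 16, 2, SearchMode::LINEAR, perf_statuses));
///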
class InferenceProfiler {
public:
/// Create a profiler that collects and summarizes inference statistics.
/// \param verbose Whether to print verbose logging.
/// \param stability_threshold The range within which a measurement is
/// considered stable, i.e. within (1 +/- stability_threshold) * the average
/// value of the last 3 measurements. The criteria are "infer per second" and
/// "average latency", or "infer per second" and "percentile latency" if a
/// valid percentile is set (see 'percentile' below).
/// \param measurement_window_ms The duration of each measurement in msec.
/// \param max_trials The maximum number of attempts to obtain
/// stable measurement.
/// \param percentile The percentile in terms of latency to be reported.
/// If it is a valid percentile value, the percentile latency will be reported
/// and used as the stability criterion instead of the average latency. If it
/// is -1, the average latency will be reported and used as the stability
/// criterion.
/// \param latency_threshold_ms The threshold on the latency measurements in
/// milliseconds.
/// \param parser The ModelParser object which holds all the details about the
/// model.
/// \param profile_backend The ClientBackend object used by the profiler to
/// communicate with the server.
/// \param manager The LoadManager object that will produce load on the
/// server.
/// \param profiler Returns a new InferenceProfiler object.
/// \param measurement_request_count The number of requests to capture when
/// using "count_windows" mode.
/// \param measurement_mode The measurement mode to use for windows.
/// \param mpi_driver The driver class for MPI operations.
/// \param metrics_interval_ms The interval, in milliseconds, at which the
/// server-side metrics are queried.
/// \param should_collect_metrics Whether server-side inference server metrics
/// should be collected.
/// \param overhead_pct_threshold User set threshold above which the PA
/// overhead is too significant to provide usable results.
/// \param collector Collector for the profile data from experiments
/// \param should_collect_profile_data Whether to collect profile data.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool verbose, const double stability_threshold,
const uint64_t measurement_window_ms, const size_t max_trials,
const int64_t percentile, const uint64_t latency_threshold_ms,
const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager,
std::unique_ptr<InferenceProfiler>* profiler,
uint64_t measurement_request_count, MeasurementMode measurement_mode,
std::shared_ptr<MPIDriver> mpi_driver, const uint64_t metrics_interval_ms,
const bool should_collect_metrics, const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data);
/// Performs the profiling on the given range with the given search algorithm.
/// For profiling using request rate, invoke the template with double;
/// otherwise invoke it with size_t for a concurrency search.
/// \param start The starting point of the search range.
/// \param end The ending point of the search range.
/// \param step The step size to move along the search range in linear search
/// or the precision in binary search.
/// \param search_mode The search algorithm to be applied.
/// \param perf_statuses Returns the trace of the measurements along the
/// search path.
/// \return cb::Error object indicating success or failure.
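///
/// Usage sketch (illustrative only; assumes an InferenceProfiler already
/// constructed via Create() and hypothetical sweep bounds): a linear
/// concurrency sweep from 1 to 8 in steps of 1 could look like
/// \code
///   std::vector<PerfStatus> perf_statuses;
///   cb::Error err =
///       profiler->Profile<size_t>(1, 8, 1, SearchMode::LINEAR, perf_statuses);
///   if (!err.IsOk()) {
///     std::cerr << err << std::endl;
///   }
/// \endcode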
template <typename T>
cb::Error Profile(
const T start, const T end, const T step, const SearchMode search_mode,
std::vector<PerfStatus>& perf_statuses)
{
cb::Error err;
bool meets_threshold, is_stable;
if (search_mode == SearchMode::NONE) {
err = Profile(perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
} else if (search_mode == SearchMode::LINEAR) {
T current_value = start;
do {
err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
current_value += step;
} while (((current_value <= end) || (end == static_cast<T>(NO_LIMIT))) &&
(meets_threshold));
// If there was only one concurrency we swept over and it did not meet the
// stability threshold, we should return an error.
if (current_value == (start + step) && is_stable == false) {
return cb::Error(
"Failed to obtain stable measurement.", pa::STABILITY_ERROR);
}
} else {
err = Profile(start, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk() || (!meets_threshold)) {
return err;
}
err = Profile(end, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk() || (meets_threshold)) {
return err;
}
T this_start = start;
T this_end = end;
while ((this_end - this_start) > step) {
T current_value = (this_end + this_start) / 2;
err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
if (meets_threshold) {
this_start = current_value;
} else {
this_end = current_value;
}
}
}
return cb::Error::Success;
}
bool IncludeServerStats() { return include_server_stats_; }
private:
InferenceProfiler(
const bool verbose, const double stability_threshold,
const int32_t measurement_window_ms, const size_t max_trials,
const bool extra_percentile, const size_t percentile,
const uint64_t latency_threshold_ms, const cb::ProtocolType protocol,
std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager, uint64_t measurement_request_count,
MeasurementMode measurement_mode, std::shared_ptr<MPIDriver> mpi_driver,
const uint64_t metrics_interval_ms, const bool should_collect_metrics,
const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data);
/// Actively measure throughput every 'measurement_window' msec until the
/// throughput is stable. Once the throughput is stable, it adds the
/// observations to the summary trace and returns whether the setting met the
/// threshold. NOTE: the requests are being sent regardless of the
/// measurement, so the data returned by the server (see struct PerfStatus)
/// will include more requests than what the client measures (we can't get the
/// exact server status right before the first request and right after the
/// last request in the measurement window).
/// \param concurrent_request_count The concurrency level for the measurement.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the setting meets the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
const size_t concurrent_request_count,
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);
/// Similar to above function, but instead of setting the concurrency, it
/// sets the specified request rate for measurements.
/// \param request_rate The request rate for inferences.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the setting meets the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
const double request_rate, std::vector<PerfStatus>& perf_statuses,
bool& meets_threshold, bool& is_stable);
/// Measures throughput and latencies for custom load without controlling
/// request rate or concurrency. Requires the load manager to be loaded with
/// a file specifying the time intervals.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the measurement met the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);
/// A helper function for profiling functions.
/// \param status_summary Returns the summary of the measurement.
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return cb::Error object indicating success or failure.
cb::Error ProfileHelper(PerfStatus& status_summary, bool* is_stable);
/// A helper function to determine if profiling is stable
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether the throughput and latencies are stable.
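///
/// Illustrative check (hypothetical numbers): with stability_threshold = 0.1
/// and the last three windows measuring 100, 105, and 98 infer/sec, the
/// average is ~101 infer/sec and every window falls within
/// (1 +/- 0.1) * 101 = [90.9, 111.1], so throughput is considered stable; the
/// same band test is applied to the latency values.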
bool DetermineStability(LoadStatus& load_status);
/// Check if latency at index idx is within the latency threshold
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether the latencies are below the max threshold
bool CheckWithinThreshold(size_t idx, LoadStatus& load_status);
/// A helper function to determine if profiling is done
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return Returns if we should break out of the infinite stability check
/// loop.
bool IsDoneProfiling(LoadStatus& load_status, bool* is_stable);
/// Check if observed inferences and latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference and latency are stable
bool CheckWindowForStability(size_t idx, LoadStatus& load_status);
/// Check if observed inferences are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference is stable
bool IsInferWindowStable(size_t idx, LoadStatus& load_status);
/// Check if observed latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether latency is stable
bool IsLatencyWindowStable(size_t idx, LoadStatus& load_status);
/// Helper function to perform measurement.
/// \param status_summary The summary of this measurement.
/// \param measurement_window The number of requests or the duration in
/// milliseconds over which to collect requests.
/// \param is_count_based Determines whether measurement_window indicates a
/// count or a duration.
/// \return cb::Error object indicating success or failure.
cb::Error Measure(
PerfStatus& status_summary, uint64_t measurement_window,
bool is_count_based);
/// Gets the server side statistics
/// \param model_status Returns the status of the models provided by
/// the server. If the model being profiled is a non-ensemble model,
/// only its status will be returned. Otherwise, the status of the composing
/// models will also be returned.
/// \return cb::Error object indicating success or failure.
cb::Error GetServerSideStatus(
std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_status);
/// Summarize the measurement with the provided statistics.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param start_stat The accumulated context status at the start.
/// \param end_stat The accumulated context status at the end.
/// \param summary Returns the summary of the measurement.
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \return cb::Error object indicating success or failure.
cb::Error Summarize(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns);
/// \param valid_range The start and end timestamp of the measurement window.
/// \param valid_sequence_count Returns the number of completed sequences
/// during the measurement. A sequence is a set of correlated requests sent to
/// a sequence model.
/// \param delayed_request_count Returns the number of requests that missed
/// their schedule.
/// \param latencies Returns the vector of latencies for requests completed
/// within the measurement window.
/// \param response_count Returns the number of responses.
/// \param valid_requests Returns a vector of valid request records.
virtual void ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* latencies, size_t& response_count,
std::vector<RequestRecord>& valid_requests);
/// Add the data from the request records to the Raw Data Collector
/// \param perf_status PerfStatus of the current measurement
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \param request_records The request records to collect.
void CollectData(
PerfStatus& perf_status, uint64_t window_start_ns, uint64_t window_end_ns,
std::vector<RequestRecord>&& request_records);
/// \param latencies The vector of request latencies collected.
/// \param summary Returns the summary with the latency-related fields set.
/// \return cb::Error object indicating success or failure.
virtual cb::Error SummarizeLatency(
const std::vector<uint64_t>& latencies, PerfStatus& summary);
/// \param latencies The vector of request latencies collected.
/// \return std::tuple object containing:
/// * mean of latencies in nanoseconds
/// * sample standard deviation of latencies in nanoseconds
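///
/// For reference, these follow the standard definitions: for n samples
/// x_1..x_n, mean = (sum of x_i) / n and the sample standard deviation is
/// sqrt((sum of (x_i - mean)^2) / (n - 1)), computed over the nanosecond
/// latency values.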
std::tuple<uint64_t, uint64_t> GetMeanAndStdDev(
const std::vector<uint64_t>& latencies);
/// \param start_stat The accumulated client statistics at the start.
/// \param end_stat The accumulated client statistics at the end.
/// \param duration_ns The duration of the measurement in nsec.
/// \param valid_request_count The number of completed requests recorded.
/// \param valid_sequence_count The number of completed sequences recorded.
/// \param delayed_request_count The number of requests that missed their
/// schedule.
/// \param response_count The number of responses.
/// \param summary Returns the summary with the fields recorded by the client
/// set.
/// \return cb::Error object indicating success or failure.
virtual cb::Error SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t delayed_request_count, const size_t valid_sequence_count,
const size_t response_count, PerfStatus& summary);
/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
/// \param num_sent_requests The number of requests sent during the last
/// window.
/// \param summary The summary object to be updated with the send request rate
/// metric.
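///
/// Illustrative calculation (hypothetical numbers): with num_sent_requests =
/// 500 and window_duration_s = 5.0, the send request rate would be
/// 500 / 5.0 = 100 requests per second.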
void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary);
/// Given a model_identifier to gather stats for, and a map of ALL stats,
/// determine which version of the model should be gathered
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model
/// \param start_stats The stats for all models at the start of the
/// measurement
/// \param end_stats The stats for all models at the end of the measurement
/// \param model_version Returns the determined model version.
/// \return cb::Error object indicating success or failure.
cb::Error DetermineStatsModelVersion(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
int64_t* model_version);
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStats(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStats(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// Calculate the overhead and put the results into the summary
///
/// \param window_duration_ns The duration of the window
/// \param idle_ns The average worker idle time during the window
/// \param summary The summary object to be updated with overhead stats
///
void SummarizeOverhead(
const uint64_t window_duration_ns, const uint64_t idle_ns,
PerfStatus& summary);
/// Returns true if all MPI ranks (models) are stable. Should be called only
/// if IsMPIRun() returns true.
/// \param current_rank_stability The stability of the current rank.
/// \return True if all MPI ranks are stable.
bool AllMPIRanksAreStable(bool current_rank_stability);
/// Merge individual perf status reports into a single perf status. This
/// function is used to merge the results from multiple Measure runs into a
/// single report.
/// \param perf_status List of perf status reports to be merged.
/// \param summary_status Final merged summary status.
/// \return cb::Error object indicating success or failure.
virtual cb::Error MergePerfStatusReports(
std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);
/// Merge individual server side statistics into a single server side report.
/// \param server_side_stats List of server side statistics reports to be
/// merged.
/// \param server_side_summary Final merged summary status.
/// \return cb::Error object indicating success or failure.
virtual cb::Error MergeServerSideStats(
std::vector<ServerSideStats>& server_side_stats,
ServerSideStats& server_side_summary);
/// \param all_metrics Individual metrics from all intervals from stable
/// passes.
/// \param merged_metrics Output merged metrics from all intervals from stable
/// passes.
/// \return cb::Error object indicating success or failure.
cb::Error MergeMetrics(
const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
Metrics& merged_metrics);
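/// Illustrative example for the per-GPU averaging helper below (hypothetical
/// GPU keys and values): averaging {"GPU-A": 60.0, "GPU-B": 40.0} and
/// {"GPU-A": 80.0} yields {"GPU-A": 70.0, "GPU-B": 40.0}, since each GPU is
/// averaged only over the intervals in which it was reported.
/// \code
///   std::map<std::string, double> m1{{"GPU-A", 60.0}, {"GPU-B", 40.0}};
///   std::map<std::string, double> m2{{"GPU-A", 80.0}};
///   std::map<std::string, double> averaged;
///   GetMetricAveragePerGPU<double>({std::cref(m1), std::cref(m2)}, averaged);
///   // averaged now holds {"GPU-A": 70.0, "GPU-B": 40.0}
/// \endcode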
template <typename T>
void GetMetricAveragePerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
std::map<std::string, size_t> metric_count_per_gpu{};
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = 0;
metric_count_per_gpu[gpu_uuid] = 0;
}
output_metric_map[gpu_uuid] += metric;
metric_count_per_gpu[gpu_uuid]++;
}
}
for (auto& output_metric : output_metric_map) {
const auto& gpu_uuid{output_metric.first};
auto& metric{output_metric.second};
const auto& metric_count{metric_count_per_gpu[gpu_uuid]};
if (metric_count > 0) {
metric /= metric_count;
}
}
}
template <typename T>
void GetMetricMaxPerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = 0;
}
output_metric_map[gpu_uuid] =
std::max(output_metric_map[gpu_uuid], metric);
}
}
}
template <typename T>
void GetMetricFirstPerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = metric;
}
}
}
}
bool verbose_;
uint64_t measurement_window_ms_;
uint64_t measurement_request_count_;
MeasurementMode measurement_mode_;
size_t max_trials_;
bool extra_percentile_;
size_t percentile_;
uint64_t latency_threshold_ms_;
cb::ProtocolType protocol_;
std::string model_name_;
int64_t model_version_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackend> profile_backend_;
std::unique_ptr<LoadManager> manager_;
std::shared_ptr<ProfileDataCollector> collector_;
LoadParams load_parameters_;
bool include_lib_stats_;
bool include_server_stats_;
std::shared_ptr<MPIDriver> mpi_driver_;
/// The request records of the requests completed during all measurements
std::vector<RequestRecord> all_request_records_;
/// The end time of the previous measurement window
uint64_t previous_window_end_ns_;
/// Server side statistics from the previous measurement window
std::map<cb::ModelIdentifier, cb::ModelStatistics> prev_server_side_stats_;
/// Client side statistics from the previous measurement window
cb::InferStat prev_client_side_stats_;
/// Metrics manager that collects server-side metrics periodically
std::shared_ptr<MetricsManager> metrics_manager_{nullptr};
/// Whether server-side inference server metrics should be collected.
bool should_collect_metrics_{false};
/// User set threshold above which the PA overhead is too significant to
/// provide usable results.
const double overhead_pct_threshold_{0.0};
/// Whether to collect profile data.
bool should_collect_profile_data_{false};
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferenceProfiler;
friend TestInferenceProfiler;
public:
InferenceProfiler() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "rate_schedule.h"
namespace triton { namespace perfanalyzer {
/// Interface for worker threads that use a schedule
///
class IScheduler {
public:
/// Provides the schedule that should be followed
///
virtual void SetSchedule(RateSchedulePtr_t schedule) = 0;
};
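/// A minimal illustrative implementation (hypothetical, for documentation
/// only):
/// \code
///   class FixedScheduleWorker : public IScheduler {
///    public:
///     void SetSchedule(RateSchedulePtr_t schedule) override
///     {
///       schedule_ = schedule;
///     }
///
///    private:
///     RateSchedulePtr_t schedule_;
///   };
/// \endcode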
}} // namespace triton::perfanalyzer
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace perfanalyzer {
/// Interface for worker threads that generate inference requests
///
class IWorker {
public:
virtual void Infer() = 0;
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_manager.h"
#include <algorithm>
#include "client_backend/client_backend.h"
#include "infer_data_manager_factory.h"
namespace triton { namespace perfanalyzer {
cb::Error
LoadManager::CheckHealth()
{
// Check thread status to make sure that the load setting is
// consistent with the one being reported.
// If some thread returns early, the main thread will return and
// the worker thread's error message will be reported
// when the derived class destructor gets called.
for (auto& thread_stat : threads_stat_) {
if (!thread_stat->status_.IsOk()) {
return cb::Error(
"Failed to maintain requested inference load."
" Worker thread(s) failed to generate concurrent requests.",
pa::GENERIC_ERROR);
}
if (!thread_stat->cb_status_.IsOk()) {
return cb::Error(
"Failed to retrieve results from inference request.",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
cb::Error
LoadManager::SwapRequestRecords(std::vector<RequestRecord>& new_request_records)
{
std::vector<RequestRecord> total_request_records;
// Gather request records with proper locking from all the worker threads
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
total_request_records.insert(
total_request_records.end(), thread_stat->request_records_.begin(),
thread_stat->request_records_.end());
thread_stat->request_records_.clear();
}
// Swap the results
total_request_records.swap(new_request_records);
return cb::Error::Success;
}
uint64_t
LoadManager::CountCollectedRequests()
{
uint64_t num_of_requests = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
num_of_requests += thread_stat->request_records_.size();
}
return num_of_requests;
}
cb::Error
LoadManager::GetAccumulatedClientStat(cb::InferStat* contexts_stat)
{
contexts_stat->completed_request_count = 0;
contexts_stat->cumulative_receive_time_ns = 0;
contexts_stat->cumulative_send_time_ns = 0;
contexts_stat->cumulative_total_request_time_ns = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
for (auto& context_stat : thread_stat->contexts_stat_) {
contexts_stat->completed_request_count +=
context_stat.completed_request_count;
contexts_stat->cumulative_total_request_time_ns +=
context_stat.cumulative_total_request_time_ns;
contexts_stat->cumulative_send_time_ns +=
context_stat.cumulative_send_time_ns;
contexts_stat->cumulative_receive_time_ns +=
context_stat.cumulative_receive_time_ns;
}
}
return cb::Error::Success;
}
uint64_t
LoadManager::GetIdleTime()
{
uint64_t total{0};
size_t num_active_threads = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
uint64_t idle_time = thread_stat->idle_timer.GetIdleTime();
if (idle_time) {
total += idle_time;
num_active_threads++;
}
}
// TODO REFACTOR TMA-1043 InferDataManager should have an API to get
// num_active_threads. This method of determining active threads isn't fully
// accurate
if (num_active_threads) {
total /= num_active_threads;
}
return total;
}
void
LoadManager::ResetIdleTime()
{
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
thread_stat->idle_timer.Reset();
}
}
const size_t
LoadManager::GetAndResetNumSentRequests()
{
size_t num_sent_requests{0};
for (auto& thread_stat : threads_stat_) {
num_sent_requests += thread_stat->num_sent_requests_;
thread_stat->num_sent_requests_ = 0;
}
return num_sent_requests;
}
LoadManager::LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: async_(async), streaming_(streaming), batch_size_(batch_size),
max_threads_(max_threads), parser_(parser), factory_(factory),
using_json_data_(false)
{
on_sequence_model_ =
((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));
data_loader_.reset(new DataLoader(batch_size_));
infer_data_manager_ = InferDataManagerFactory::CreateInferDataManager(
max_threads, batch_size, shared_memory_type, output_shm_size, parser,
factory, data_loader_);
}
void
LoadManager::InitManager(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data,
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation)
{
// Note, this is already caught by the CLI, but adding it here for extra
// protection
if (on_sequence_model_ && batch_size_ > 1) {
throw PerfAnalyzerException(
"error: sequence models do not support batching", GENERIC_ERROR);
}
auto status =
InitManagerInputs(string_length, string_data, zero_input, user_data);
THROW_IF_ERROR(status, "Failed to init manager inputs");
THROW_IF_ERROR(
infer_data_manager_->Init(), "Unable to init infer data manager");
sequence_manager_ = MakeSequenceManager(
start_sequence_id, sequence_id_range, sequence_length,
sequence_length_specified, sequence_length_variation, using_json_data_,
data_loader_);
InitManagerFinalize();
}
cb::Error
LoadManager::InitManagerInputs(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data)
{
RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_));
// Read provided data
if (!user_data.empty()) {
if (IsDirectory(user_data[0])) {
RETURN_IF_ERROR(data_loader_->ReadDataFromDir(
parser_->Inputs(), parser_->Outputs(), user_data[0]));
} else {
using_json_data_ = true;
for (const auto& json_file : user_data) {
RETURN_IF_ERROR(data_loader_->ReadDataFromJSON(
parser_->Inputs(), parser_->Outputs(), json_file));
}
std::cout << " Successfully read data for "
<< data_loader_->GetDataStreamsCount() << " stream/streams";
if (data_loader_->GetDataStreamsCount() == 1) {
std::cout << " with " << data_loader_->GetTotalSteps(0)
<< " step/steps";
}
std::cout << "." << std::endl;
}
} else {
RETURN_IF_ERROR(data_loader_->GenerateData(
parser_->Inputs(), zero_input, string_length, string_data));
}
// Reserve the required vector space
threads_stat_.reserve(max_threads_);
return cb::Error::Success;
}
void
LoadManager::StopWorkerThreads()
{
early_exit = true;
// wake up all threads
wake_signal_.notify_all();
size_t cnt = 0;
for (auto& thread : threads_) {
thread.join();
if (!threads_stat_[cnt]->status_.IsOk()) {
std::cerr << "Thread [" << cnt
<< "] had error: " << (threads_stat_[cnt]->status_)
<< std::endl;
}
if (!threads_stat_[cnt]->cb_status_.IsOk()) {
std::cerr << "Thread [" << cnt
<< "] had error: " << (threads_stat_[cnt]->cb_status_)
<< std::endl;
}
cnt++;
}
threads_.clear();
}
std::shared_ptr<SequenceManager>
LoadManager::MakeSequenceManager(
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation, const bool using_json_data,
std::shared_ptr<DataLoader> data_loader)
{
return std::make_shared<SequenceManager>(
start_sequence_id, sequence_id_range, sequence_length,
sequence_length_specified, sequence_length_variation, using_json_data,
data_loader);
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <memory>
#include <random>
#include <thread>
#include "client_backend/client_backend.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "load_worker.h"
#include "perf_utils.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockLoadManager;
#endif
class LoadManager {
public:
virtual ~LoadManager() = default;
/// Initialize the Manager class to set up shared memory and inputs
/// \param string_length The length of the random strings to be generated
/// for string inputs.
/// \param string_data The string to be used as string inputs for the model.
/// \param zero_input Whether to use zero for model inputs.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \param start_sequence_id The starting sequence ID to be used for iterating
/// through valid sequence IDs.
/// \param sequence_id_range The maximum sequence ID to be used for iterating
/// through valid sequence IDs.
/// \param sequence_length The base length of new sequences.
/// \param sequence_length_specified Whether the user specified the sequence
/// length.
/// \param sequence_length_variation The percentage variation in length of
/// sequences using autogenerated data as input.
void InitManager(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data,
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation);
/// Check if the load manager is working as expected.
/// \return cb::Error object indicating success or failure.
cb::Error CheckHealth();
/// Swap the content of the request records vector recorded by the load
/// manager with a new request records vector
/// \param new_request_records The request records vector to be swapped.
/// \return cb::Error object indicating success or failure.
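///
/// A caller typically passes in an empty vector and receives the records
/// gathered from all worker threads (whose own record vectors are cleared),
/// e.g. (illustrative only):
/// \code
///   std::vector<RequestRecord> records;
///   manager->SwapRequestRecords(records);
///   // 'records' now holds the request records collected so far.
/// \endcode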
cb::Error SwapRequestRecords(std::vector<RequestRecord>& new_request_records);
/// Get the sum of all contexts' stats.
/// \param contexts_stat Returns the accumulated stats from all contexts
/// in the load manager.
/// \return cb::Error object indicating success or failure.
cb::Error GetAccumulatedClientStat(cb::InferStat* contexts_stat);
/// Returns the average idle time per worker thread in nanoseconds
///
uint64_t GetIdleTime();
/// Resets the counters used for tracking idle time
///
void ResetIdleTime();
/// Calculates and returns the total number of sent requests across all
/// threads. Resets individual number of sent requests per thread.
/// \return The total number of sent requests across all threads.
const size_t GetAndResetNumSentRequests();
/// \return the batch size used for the inference requests
virtual size_t BatchSize() const { return batch_size_; }
/// Count the number of requests collected until now.
uint64_t CountCollectedRequests();
protected:
LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
/// Complete any subclass-specific manager initialization tasks.
virtual void InitManagerFinalize() {}
/// Helper function to retrieve the input data for the inferences
/// \param string_length The length of the random strings to be generated
/// for string inputs.
/// \param string_data The string to be used as string inputs for the model.
/// \param zero_input Whether to use zero for model inputs.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \return cb::Error object indicating success or failure.
cb::Error InitManagerInputs(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data);
/// Stops all the worker threads generating the request load.
void StopWorkerThreads();
protected:
bool async_;
bool streaming_;
size_t batch_size_;
size_t max_threads_;
bool on_sequence_model_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
bool using_json_data_;
std::shared_ptr<DataLoader> data_loader_;
std::unique_ptr<cb::ClientBackend> backend_;
std::shared_ptr<IInferDataManager> infer_data_manager_;
// Track the workers so they all go out of scope at the
// same time
std::vector<std::shared_ptr<IWorker>> workers_;
// Worker threads that load the server with inferences
std::vector<std::thread> threads_;
// Contains the statistics on the current working threads
std::vector<std::shared_ptr<ThreadStat>> threads_stat_;
// Use condition variable to pause/continue worker threads
std::condition_variable wake_signal_;
std::mutex wake_mutex_;
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
virtual std::shared_ptr<SequenceManager> MakeSequenceManager(
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation, const bool using_json_data,
std::shared_ptr<DataLoader> data_loader);
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockLoadManager;
public:
LoadManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_worker.h"
#include <algorithm>
#include <thread>
#include "client_backend/client_backend.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
bool
LoadWorker::ShouldExit()
{
return early_exit || !thread_stat_->cb_status_.IsOk() ||
!thread_stat_->status_.IsOk();
}
bool
LoadWorker::HandleExitConditions()
{
if (ShouldExit()) {
CompleteOngoingSequences();
WaitForOngoingRequests();
return true;
}
return false;
}
void
LoadWorker::CompleteOngoingSequences()
{
if (on_sequence_model_) {
for (size_t ctx_id = 0; ctx_id < ctxs_.size(); ++ctx_id) {
size_t seq_stat_index = GetSeqStatIndex(ctx_id);
ctxs_[ctx_id]->CompleteOngoingSequence(seq_stat_index);
}
}
}
void
LoadWorker::WaitForOngoingRequests()
{
while (GetNumOngoingRequests() != 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
}
uint
LoadWorker::GetNumOngoingRequests()
{
uint num = 0;
for (auto ctx : ctxs_) {
num += ctx->GetNumOngoingRequests();
}
return num;
}
void
LoadWorker::CreateContext()
{
auto ctx = CreateInferContext();
ctx->Init();
CreateContextFinalize(ctx);
ctxs_.push_back(ctx);
}
uint32_t
LoadWorker::GetCtxId()
{
std::lock_guard<std::mutex> lk(cb_mtx_);
return ctx_id_tracker_->Get();
}
void
LoadWorker::RestoreFreeCtxId(uint32_t ctx_id)
{
if (!async_) {
{
std::lock_guard<std::mutex> lock(cb_mtx_);
ctx_id_tracker_->Restore(ctx_id);
}
}
}
void
LoadWorker::AsyncCallbackFinalize(uint32_t ctx_id)
{
// avoid competition over 'cb_mtx_'
{
std::lock_guard<std::mutex> lk(cb_mtx_);
ctx_id_tracker_->Restore(ctx_id);
notified_ = true;
}
cb_cv_.notify_all();
}
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>
#include "ctx_id_tracker_factory.h"
#include "data_loader.h"
#include "infer_context.h"
#include "iworker.h"
#include "model_parser.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
/// Abstract base class for worker threads
///
class LoadWorker : public IWorker {
protected:
LoadWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async, const bool streaming,
const int32_t batch_size, const bool using_json_data,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: id_(id), thread_stat_(thread_stat), parser_(parser),
data_loader_(data_loader), factory_(factory),
on_sequence_model_(on_sequence_model), async_(async),
streaming_(streaming), batch_size_(batch_size),
using_json_data_(using_json_data), wake_signal_(wake_signal),
wake_mutex_(wake_mutex), execute_(execute),
infer_data_manager_(infer_data_manager),
sequence_manager_(sequence_manager)
{
}
virtual ~LoadWorker() = default;
protected:
// Return the total number of async requests that have started and not
// finished
uint GetNumOngoingRequests();
void SendInferRequest(uint32_t ctx_id, bool delayed = false)
{
if (ShouldExit()) {
return;
}
if (on_sequence_model_) {
uint32_t seq_stat_index = GetSeqStatIndex(ctx_id);
ctxs_[ctx_id]->SendSequenceInferRequest(seq_stat_index, delayed);
} else {
ctxs_[ctx_id]->SendInferRequest(delayed);
}
}
virtual std::shared_ptr<InferContext> CreateInferContext()
{
return std::make_shared<InferContext>(
id_, ctxs_.size(), async_, streaming_, on_sequence_model_,
using_json_data_, batch_size_, thread_stat_, data_loader_, parser_,
factory_, execute_, infer_data_manager_, sequence_manager_);
}
// Create an inference context and add it to ctxs_
virtual void CreateContext();
// Any code that needs to execute after the Context has been created
virtual void CreateContextFinalize(std::shared_ptr<InferContext> ctx) = 0;
// Detect the cases where this thread needs to exit
bool ShouldExit();
// Detect and handle the case where this thread needs to exit
// Returns true if an exit condition was met
bool HandleExitConditions();
void CompleteOngoingSequences();
void WaitForOngoingRequests();
virtual uint32_t GetSeqStatIndex(uint32_t ctx_id) = 0;
uint32_t GetCtxId();
void RestoreFreeCtxId(uint32_t ctx_id);
void AsyncCallbackFinalize(uint32_t ctx_id);
uint32_t id_;
std::vector<std::shared_ptr<InferContext>> ctxs_;
std::shared_ptr<ICtxIdTracker> ctx_id_tracker_;
// Variables used to signal async request completion
bool notified_ = false;
std::mutex cb_mtx_;
std::condition_variable cb_cv_;
// TODO REFACTOR TMA-1017 is there a better way to do threading than to pass
// the same cv/mutex into every thread by reference? Used to wake up this
// thread if it has been put to sleep
std::condition_variable& wake_signal_;
std::mutex& wake_mutex_;
// TODO REFACTOR TMA-1017 is there a better way to communicate this than a
// shared bool reference? Used to pause execution of this thread
bool& execute_;
// Stats for this thread
std::shared_ptr<ThreadStat> thread_stat_;
std::shared_ptr<DataLoader> data_loader_;
const std::shared_ptr<ModelParser> parser_;
const std::shared_ptr<cb::ClientBackendFactory> factory_;
const std::shared_ptr<IInferDataManager> infer_data_manager_;
const bool on_sequence_model_;
const bool async_;
const bool streaming_;
const int32_t batch_size_;
const bool using_json_data_;
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "perf_analyzer.h"
#include "perf_analyzer_exception.h"
namespace pa = triton::perfanalyzer;
int
main(int argc, char* argv[])
{
try {
triton::perfanalyzer::CLParser clp;
pa::PAParamsPtr params = clp.Parse(argc, argv);
PerfAnalyzer analyzer(params);
analyzer.Run();
}
catch (pa::PerfAnalyzerException& e) {
return e.GetError();
}
return 0;
}
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cstdint>
#include <map>
#include <string>
namespace triton { namespace perfanalyzer {
/// Struct that holds server-side metrics for the inference server.
/// The keys for each map are GPU UUIDs and the values are described in the
/// variable names.
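///
/// Illustrative contents (hypothetical UUID and values): after one scrape of
/// the metrics endpoint an instance might hold
/// \code
///   Metrics m;
///   m.gpu_utilization_per_gpu["GPU-aaaa"] = 0.75;
///   m.gpu_power_usage_per_gpu["GPU-aaaa"] = 150.0;
///   m.gpu_memory_used_bytes_per_gpu["GPU-aaaa"] = 2000000000;
///   m.gpu_memory_total_bytes_per_gpu["GPU-aaaa"] = 16000000000;
/// \endcode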
struct Metrics {
std::map<std::string, double> gpu_utilization_per_gpu{};
std::map<std::string, double> gpu_power_usage_per_gpu{};
std::map<std::string, uint64_t> gpu_memory_used_bytes_per_gpu{};
std::map<std::string, uint64_t> gpu_memory_total_bytes_per_gpu{};
};
}} // namespace triton::perfanalyzer
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "metrics_manager.h"
#include <iostream>
#include <stdexcept>
#include <utility>
#include "constants.h"
#include "perf_analyzer_exception.h"
namespace triton { namespace perfanalyzer {
MetricsManager::MetricsManager(
std::shared_ptr<clientbackend::ClientBackend> client_backend,
uint64_t metrics_interval_ms)
: client_backend_(client_backend), metrics_interval_ms_(metrics_interval_ms)
{
}
MetricsManager::~MetricsManager()
{
if (query_loop_future_.valid()) {
StopQueryingMetrics();
}
}
void
MetricsManager::StartQueryingMetrics()
{
should_keep_querying_ = true;
query_loop_future_ =
std::async(&MetricsManager::QueryMetricsEveryNMilliseconds, this);
}
void
MetricsManager::QueryMetricsEveryNMilliseconds()
{
while (should_keep_querying_) {
const auto& start{std::chrono::system_clock::now()};
Metrics metrics{};
clientbackend::Error err{client_backend_->Metrics(metrics)};
if (err.IsOk() == false) {
throw PerfAnalyzerException(err.Message(), err.Err());
}
CheckForMissingMetrics(metrics);
{
std::lock_guard<std::mutex> metrics_lock{metrics_mutex_};
metrics_.push_back(std::move(metrics));
}
const auto& end{std::chrono::system_clock::now()};
const auto& duration{end - start};
const auto& remainder{
std::chrono::milliseconds(metrics_interval_ms_) - duration};
CheckForMetricIntervalTooShort(remainder, duration);
{
std::unique_lock<std::mutex> query_loop_lock{query_loop_mutex_};
query_loop_cv_.wait_for(query_loop_lock, remainder);
}
}
}
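// Note on the loop above (hypothetical numbers): with metrics_interval_ms_ ==
// 1000 and a query that takes 200ms, the remainder is ~800ms, so the loop
// waits that long before the next scrape; a negative remainder triggers the
// interval-too-short warning emitted by CheckForMetricIntervalTooShort().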
void
MetricsManager::CheckForMissingMetrics(const Metrics& metrics)
{
if (has_given_missing_metrics_warning_) {
return;
}
if (metrics.gpu_utilization_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_utilization' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_power_usage_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_power_usage' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_memory_used_bytes_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_memory_used_bytes' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_memory_total_bytes_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_memory_total_bytes' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
}
void
MetricsManager::CheckForMetricIntervalTooShort(
const std::chrono::nanoseconds& remainder,
const std::chrono::nanoseconds& duration)
{
if (has_given_metric_interval_warning_) {
return;
}
if (remainder < std::chrono::nanoseconds::zero()) {
std::cerr << "WARNING: Triton metrics endpoint latency ("
<< std::chrono::duration_cast<std::chrono::milliseconds>(duration)
.count()
<< "ms) is larger than the querying interval ("
<< metrics_interval_ms_
<< "ms). Please try a larger querying interval "
"via `--triton-metrics-interval`."
<< std::endl;
has_given_metric_interval_warning_ = true;
}
}
void
MetricsManager::CheckQueryingStatus()
{
if (query_loop_future_.valid() &&
query_loop_future_.wait_for(std::chrono::seconds(0)) ==
std::future_status::ready) {
query_loop_future_.get();
}
}
void
MetricsManager::GetLatestMetrics(std::vector<Metrics>& metrics)
{
if (metrics.empty() == false) {
throw PerfAnalyzerException(
"MetricsManager::GetLatestMetrics() must be passed an empty vector.",
GENERIC_ERROR);
}
std::lock_guard<std::mutex> metrics_lock{metrics_mutex_};
metrics_.swap(metrics);
}
void
MetricsManager::StopQueryingMetrics()
{
should_keep_querying_ = false;
query_loop_cv_.notify_one();
if (query_loop_future_.valid()) {
query_loop_future_.get();
}
}
}} // namespace triton::perfanalyzer
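The query loop above combines std::async, a condition variable, and a wait_for() on the remainder of the interval so that StopQueryingMetrics() can interrupt the sleep immediately. The stripped-down, standalone sketch below illustrates the same pattern outside of Perf Analyzer; the PeriodicWorker and DoWork names are purely illustrative and not part of this codebase.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <future>
#include <iostream>
#include <mutex>

// Minimal stand-in for MetricsManager's query loop: run DoWork() roughly
// every `interval`, sleeping only for whatever time is left after the work,
// and wake up early when Stop() is called.
class PeriodicWorker {
 public:
  explicit PeriodicWorker(std::chrono::milliseconds interval)
      : interval_(interval)
  {
  }

  void Start()
  {
    keep_running_ = true;
    loop_future_ = std::async(std::launch::async, &PeriodicWorker::Loop, this);
  }

  void Stop()
  {
    keep_running_ = false;
    cv_.notify_one();  // interrupt a pending wait_for() immediately
    if (loop_future_.valid()) {
      loop_future_.get();  // rethrows any exception thrown inside Loop()
    }
  }

 private:
  void Loop()
  {
    while (keep_running_) {
      const auto start{std::chrono::steady_clock::now()};
      DoWork();
      const auto elapsed{std::chrono::steady_clock::now() - start};
      std::unique_lock<std::mutex> lock{mutex_};
      // If the work took longer than the interval, the remainder is negative
      // and wait_for() returns immediately.
      cv_.wait_for(lock, interval_ - elapsed);
    }
  }

  void DoWork() { std::cout << "querying metrics..." << std::endl; }

  std::chrono::milliseconds interval_;
  std::atomic<bool> keep_running_{false};
  std::future<void> loop_future_{};
  std::mutex mutex_{};
  std::condition_variable cv_{};
};

Specifying std::launch::async guarantees the loop runs on its own thread rather than being deferred until get() is called, which is the behavior the MetricsManager documentation describes.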
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <future>
#include <memory>
#include <mutex>
#include <vector>
#include "client_backend/client_backend.h"
#include "metrics.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestMetricsManager;
#endif
class MetricsManager {
public:
MetricsManager(
std::shared_ptr<clientbackend::ClientBackend> client_backend,
uint64_t metrics_interval_ms);
/// Ends the background thread as a safety net in case StopQueryingMetrics()
/// was not called explicitly
~MetricsManager();
/// Starts a background thread that queries metrics at a fixed interval
void StartQueryingMetrics();
/// Checks whether the background thread threw an exception and rethrows it on
/// the calling thread if so
void CheckQueryingStatus();
/// Moves the metrics collected so far by the background thread into the
/// output vector for use by the main thread
void GetLatestMetrics(std::vector<Metrics>& metrics_per_timestamp);
/// Ends the background thread
void StopQueryingMetrics();
private:
void QueryMetricsEveryNMilliseconds();
void CheckForMissingMetrics(const Metrics& metrics);
void CheckForMetricIntervalTooShort(
const std::chrono::nanoseconds& remainder,
const std::chrono::nanoseconds& duration);
std::shared_ptr<clientbackend::ClientBackend> client_backend_{nullptr};
uint64_t metrics_interval_ms_{0};
std::mutex metrics_mutex_{};
std::vector<Metrics> metrics_{};
bool should_keep_querying_{false};
std::future<void> query_loop_future_{};
std::mutex query_loop_mutex_{};
std::condition_variable query_loop_cv_{};
bool has_given_missing_metrics_warning_{false};
bool has_given_metric_interval_warning_{false};
#ifndef DOCTEST_CONFIG_DISABLE
friend TestMetricsManager;
public:
MetricsManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
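A minimal sketch of how a caller might drive this interface, assuming a concrete clientbackend::ClientBackend named `backend` is already constructed and the surrounding load-generation harness is omitted; the RunWithMetrics function and the 1000 ms interval are illustrative choices, not part of the original code.

#include <chrono>
#include <memory>
#include <thread>
#include <vector>

#include "metrics.h"
#include "metrics_manager.h"

namespace pa = triton::perfanalyzer;

void
RunWithMetrics(std::shared_ptr<pa::clientbackend::ClientBackend> backend)
{
  // Poll the Triton metrics endpoint every 1000 ms on a background thread.
  pa::MetricsManager manager(backend, 1000 /* metrics_interval_ms */);
  manager.StartQueryingMetrics();

  for (int i = 0; i < 5; i++) {
    std::this_thread::sleep_for(std::chrono::seconds(1));
    // Rethrows on this thread if the background query loop failed.
    manager.CheckQueryingStatus();
  }

  // GetLatestMetrics() requires an empty vector and swaps the collected
  // metrics into it.
  std::vector<pa::Metrics> collected;
  manager.GetLatestMetrics(collected);

  // The destructor would also stop the loop, but stopping explicitly is
  // clearer.
  manager.StopQueryingMetrics();
}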
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "concurrency_worker.h"
#include "gmock/gmock.h"
namespace triton { namespace perfanalyzer {
class NaggyMockConcurrencyWorker : public ConcurrencyWorker {
public:
NaggyMockConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async,
const size_t max_concurrency, const bool using_json_data,
const bool streaming, const int32_t batch_size,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
size_t& active_threads, bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: ConcurrencyWorker(
id, thread_stat, thread_config, parser, data_loader, factory,
on_sequence_model, async, max_concurrency, using_json_data,
streaming, batch_size, wake_signal, wake_mutex, active_threads,
execute, infer_data_manager, sequence_manager)
{
ON_CALL(*this, Infer()).WillByDefault([this]() -> void {
ConcurrencyWorker::Infer();
});
}
MOCK_METHOD(void, Infer, (), (override));
void EmptyInfer() { thread_config_->is_paused_ = true; }
};
// "Nice" (non-naggy) version of the mock: suppresses gmock's "uninteresting
// mock function call" warnings when a mocked method is invoked without a
// matching EXPECT_CALL
using MockConcurrencyWorker = testing::NiceMock<NaggyMockConcurrencyWorker>;
}} // namespace triton::perfanalyzer
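The ON_CALL(...).WillByDefault(...) in the constructor above makes the mock fall through to the real ConcurrencyWorker::Infer() unless a test overrides it. The sketch below shows the same naggy/nice mock pattern on a deliberately simplified, hypothetical Worker base class, since constructing a real ConcurrencyWorker requires the full set of perf_analyzer dependencies.

#include "gmock/gmock.h"
#include "gtest/gtest.h"

// Hypothetical base class standing in for ConcurrencyWorker.
class Worker {
 public:
  virtual ~Worker() = default;
  virtual void Infer() { /* real inference loop would run here */ }
};

class NaggyMockWorker : public Worker {
 public:
  NaggyMockWorker()
  {
    // Default action delegates to the real implementation, mirroring
    // NaggyMockConcurrencyWorker above.
    ON_CALL(*this, Infer()).WillByDefault([this]() { Worker::Infer(); });
  }
  MOCK_METHOD(void, Infer, (), (override));
};

// NiceMock suppresses warnings for calls with no matching EXPECT_CALL, which
// is why the alias above wraps the naggy mock in testing::NiceMock.
using MockWorker = testing::NiceMock<NaggyMockWorker>;

TEST(MockWorkerExample, InferDelegatesToRealImplementationByDefault)
{
  MockWorker worker;
  EXPECT_CALL(worker, Infer()).Times(1);
  worker.Infer();  // runs Worker::Infer() via the ON_CALL default action
}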