// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <string>
#include <vector>
#include "client_backend/client_backend.h"
#include "request_rate_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestCustomLoadManager;
#endif
//==============================================================================
/// CustomLoadManager is a helper class to send inference requests to
/// inference server in accordance with user provided time intervals. This
/// load manager can be used to model certain patterns of interest.
///
class CustomLoadManager : public RequestRateManager {
public:
~CustomLoadManager() = default;
/// Create an object of realistic load manager that is responsible for
/// maintaining the specified load on the inference server.
/// \param async Whether to use asynchronous or synchronous API for infer
/// request.
/// \param streaming Whether to use gRPC streaming API for infer request
/// \param measurement_window_ms The time window for measurements.
/// \param max_trials The maximum number of windows that will be measured
/// \param request_intervals_file The path to the file to use to pick up the
/// time intervals between the successive requests.
/// \param batch_size The batch size used for each request.
/// \param max_threads The maximum number of working threads to be spawned.
/// \param num_of_sequences The number of concurrent sequences that must be
/// maintained on the server.
/// \param zero_input Whether to fill the input tensors with zero.
/// \param input_shapes The shape of the input tensors.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \param shared_memory_type The type of shared memory to use for inputs.
/// \param output_shm_size The size of the shared memory to allocate for the
/// output.
/// \param serial_sequences Enable serial sequence mode.
/// \param parser The ModelParser object to get the model details.
/// \param factory The ClientBackendFactory object used to create
/// client to the server.
/// \param manager Returns a new CustomLoadManager object.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool async, const bool streaming,
const uint64_t measurement_window_ms, const size_t max_trials,
const std::string& request_intervals_file, const int32_t batch_size,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);
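// Illustrative usage sketch only. The option values below and the 'parser' /
// 'factory' objects are placeholders assumed to be built by the caller; they
// are not defined in this header.
//
//   std::unique_ptr<LoadManager> manager;
//   cb::Error err = CustomLoadManager::Create(
//       /*async=*/true, /*streaming=*/false,
//       /*measurement_window_ms=*/5000, /*max_trials=*/10,
//       "custom_intervals.txt", /*batch_size=*/1,
//       /*max_threads=*/16, /*num_of_sequences=*/4,
//       /*shared_memory_type=*/NO_SHARED_MEMORY, /*output_shm_size=*/0,
//       /*serial_sequences=*/false, parser, factory, &manager);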
/// Initializes the load manager with the provided file containing request
/// intervals
/// \return cb::Error object indicating success or failure.
cb::Error InitCustomIntervals();
/// Computes the request rate from the time interval file. Fails with an error
/// if the file is not present or is empty.
/// \param request_rate Returns request rate as computed from the time
/// interval file.
/// \return cb::Error object indicating success or failure.
cb::Error GetCustomRequestRate(double* request_rate);
private:
CustomLoadManager(
const bool async, const bool streaming,
const std::string& request_intervals_file, const int32_t batch_size,
const uint64_t measurement_window_ms, const size_t max_trials,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
cb::Error GenerateSchedule();
std::vector<RateSchedulePtr_t> CreateWorkerSchedules();
/// Reads the time intervals file and stores intervals in vector
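/// (one interval per line, in microseconds, as accepted by the
/// --request-intervals CLI option)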
/// \param path Filesystem path of the time intervals file.
/// \param contents Output intervals vector.
/// \return cb::Error object indicating success or failure.
virtual cb::Error ReadTimeIntervalsFile(
const std::string& path, NanoIntervals* contents);
std::string request_intervals_file_;
NanoIntervals custom_intervals_;
#ifndef DOCTEST_CONFIG_DISABLE
friend TestCustomLoadManager;
public:
CustomLoadManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "data_loader.h"
#include <b64/decode.h>
#include <rapidjson/filereadstream.h>
#include <fstream>
namespace triton { namespace perfanalyzer {
DataLoader::DataLoader(const size_t batch_size)
: batch_size_(batch_size), data_stream_cnt_(0)
{
}
cb::Error
DataLoader::ReadDataFromDir(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& data_directory)
{
// Directory structure supports only a single data stream and step
data_stream_cnt_ = 1;
step_num_.push_back(1);
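// Tensor data read here is keyed by "<tensor name>_<stream id>_<step id>";
// the directory layout always maps to stream 0, step 0.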
for (const auto& input : *inputs) {
if (input.second.datatype_.compare("BYTES") != 0) {
const auto file_path = data_directory + "/" + input.second.name_;
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
RETURN_IF_ERROR(ReadFile(file_path, &it->second));
int64_t byte_size = ByteSize(input.second.shape_, input.second.datatype_);
if (byte_size < 0) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
if (it->second.size() != byte_size) {
return cb::Error(
"provided data for input " + input.second.name_ +
" has byte size " + std::to_string(it->second.size()) +
", expect " + std::to_string(byte_size),
pa::GENERIC_ERROR);
}
} else {
const auto file_path = data_directory + "/" + input.second.name_;
std::vector<std::string> input_string_data;
RETURN_IF_ERROR(ReadTextFile(file_path, &input_string_data));
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(input_string_data, &it->second);
int64_t batch1_num_strings = ElementCount(input.second.shape_);
if (batch1_num_strings == -1) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
if (input_string_data.size() != batch1_num_strings) {
return cb::Error(
"provided data for input " + input.second.name_ + " has " +
std::to_string(input_string_data.size()) +
" elements, expect " + std::to_string(batch1_num_strings),
pa::GENERIC_ERROR);
}
}
}
for (const auto& output : *outputs) {
if (output.second.datatype_.compare("BYTES") != 0) {
const auto file_path = data_directory + "/" + output.second.name_;
std::string key_name(
output.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = output_data_.emplace(key_name, std::vector<char>()).first;
if (!ReadFile(file_path, &it->second).IsOk()) {
output_data_.erase(it);
}
} else {
const auto file_path = data_directory + "/" + output.second.name_;
std::vector<std::string> output_string_data;
if (!ReadTextFile(file_path, &output_string_data).IsOk()) {
continue;
}
std::string key_name(
output.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = output_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(output_string_data, &it->second);
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadDataFromJSON(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& json_file)
{
FILE* data_file = fopen(json_file.c_str(), "r");
if (data_file == nullptr) {
return cb::Error(
"failed to open file for reading provided data", pa::GENERIC_ERROR);
}
char readBuffer[65536];
rapidjson::FileReadStream fs(data_file, readBuffer, sizeof(readBuffer));
rapidjson::Document d{};
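// Allow NaN and Inf literals when parsing the user-provided JSON data.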
const unsigned int parseFlags = rapidjson::kParseNanAndInfFlag;
d.ParseStream<parseFlags>(fs);
fclose(data_file);
return ParseData(d, inputs, outputs);
}
cb::Error
DataLoader::ParseData(
const rapidjson::Document& json,
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs)
{
if (json.HasParseError()) {
std::cerr << "cb::Error : " << json.GetParseError() << '\n'
<< "Offset : " << json.GetErrorOffset() << '\n';
return cb::Error(
"failed to parse the specified json file for reading provided data",
pa::GENERIC_ERROR);
}
if (!json.HasMember("data")) {
return cb::Error(
"The json file doesn't contain data field", pa::GENERIC_ERROR);
}
const rapidjson::Value& streams = json["data"];
// Validation data is optional, once provided, it must align with 'data'
const rapidjson::Value* out_streams = nullptr;
if (json.HasMember("validation_data")) {
out_streams = &json["validation_data"];
if (out_streams->Size() != streams.Size()) {
return cb::Error(
"The 'validation_data' field doesn't align with 'data' field in the "
"json file",
pa::GENERIC_ERROR);
}
}
int count = streams.Size();
data_stream_cnt_ += count;
int offset = step_num_.size();
for (size_t i = offset; i < data_stream_cnt_; i++) {
const rapidjson::Value& steps = streams[i - offset];
const rapidjson::Value* output_steps =
(out_streams == nullptr) ? nullptr : &(*out_streams)[i - offset];
RETURN_IF_ERROR(ValidateParsingMode(steps));
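// An array entry lists the steps of one data stream (multi-stream layout);
// a plain object means the top-level 'data' array itself holds the steps of
// a single stream, handled in the else branch below.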
if (steps.IsArray()) {
step_num_.push_back(steps.Size());
for (size_t k = 0; k < step_num_[i]; k++) {
RETURN_IF_ERROR(ReadTensorData(steps[k], inputs, i, k, true));
}
if (output_steps != nullptr) {
if (!output_steps->IsArray() ||
(output_steps->Size() != steps.Size())) {
return cb::Error(
"The 'validation_data' field doesn't align with 'data' field in "
"the json file",
pa::GENERIC_ERROR);
}
for (size_t k = 0; k < step_num_[i]; k++) {
RETURN_IF_ERROR(
ReadTensorData((*output_steps)[k], outputs, i, k, false));
}
}
} else {
// There is no nesting of tensors, so interpret 'streams' itself as the list
// of steps and add the tensors to a single stream '0'.
int offset = 0;
if (step_num_.empty()) {
step_num_.push_back(count);
} else {
offset = step_num_[0];
step_num_[0] += (count);
}
data_stream_cnt_ = 1;
for (size_t k = offset; k < step_num_[0]; k++) {
RETURN_IF_ERROR(
ReadTensorData(streams[k - offset], inputs, 0, k, true));
}
if (out_streams != nullptr) {
for (size_t k = offset; k < step_num_[0]; k++) {
RETURN_IF_ERROR(
ReadTensorData((*out_streams)[k - offset], outputs, 0, k, false));
}
}
break;
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::GenerateData(
std::shared_ptr<ModelTensorMap> inputs, const bool zero_input,
const size_t string_length, const std::string& string_data)
{
// Data generation supports only a single data stream and step
// Not supported for inputs with dynamic shapes
data_stream_cnt_ = 1;
step_num_.push_back(1);
// Validate the absence of shape tensors
for (const auto& input : *inputs) {
if (input.second.is_shape_tensor_) {
return cb::Error(
"can not generate data for shape tensor '" + input.second.name_ +
"', user-provided data is needed.",
pa::GENERIC_ERROR);
}
}
uint64_t max_input_byte_size = 0;
for (const auto& input : *inputs) {
if (input.second.datatype_.compare("BYTES") != 0) {
int64_t byte_size = ByteSize(input.second.shape_, input.second.datatype_);
if (byte_size < 0) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
max_input_byte_size = std::max(max_input_byte_size, (size_t)byte_size);
} else {
// Generate string input and store it into map
std::vector<std::string> input_string_data;
int64_t batch1_num_strings = ElementCount(input.second.shape_);
if (batch1_num_strings == -1) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
input_string_data.resize(batch1_num_strings);
if (!string_data.empty()) {
for (size_t i = 0; i < batch1_num_strings; i++) {
input_string_data[i] = string_data;
}
} else {
for (size_t i = 0; i < batch1_num_strings; i++) {
input_string_data[i] = GetRandomString(string_length);
}
}
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(input_string_data, &it->second);
}
}
// Create a zero or randomly (as indicated by zero_input)
// initialized buffer that is large enough to provide the largest
// needed input. We (re)use this buffer for all non-string input values.
if (max_input_byte_size > 0) {
if (zero_input) {
input_buf_.resize(max_input_byte_size, 0);
} else {
input_buf_.resize(max_input_byte_size);
for (auto& byte : input_buf_) {
byte = rand();
}
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetInputData(
const ModelTensor& input, const int stream_id, const int step_id,
TensorData& data)
{
data.data_ptr = nullptr;
data.batch1_size = 0;
data.is_valid = false;
// If json data is available then try to retrieve the data from there
if (!input_data_.empty()) {
RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id));
std::string key_name(
input.name_ + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
// Get the data and the corresponding byte-size
auto it = input_data_.find(key_name);
if (it != input_data_.end()) {
std::vector<char>* data_vec = &it->second;
data.is_valid = true;
data.batch1_size = data_vec->size();
data.data_ptr = (const uint8_t*)data_vec->data();
}
}
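// No user-provided data matched this input; fall back to the shared
// generated buffer (zero or random) for non-string inputs.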
if (!data.is_valid) {
if ((input.datatype_.compare("BYTES") != 0) && (input_buf_.size() != 0)) {
int64_t byte_size = ByteSize(input.shape_, input.datatype_);
if (byte_size < 0) {
return cb::Error(
"failed to get correct byte size for '" + input.name_ + "'.",
pa::GENERIC_ERROR);
}
data.batch1_size = (size_t)byte_size;
data.data_ptr = &input_buf_[0];
data.is_valid = true;
}
}
if (input.is_optional_ == false && !data.is_valid) {
return cb::Error(
"unable to find data for input '" + input.name_ + "'.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetOutputData(
const std::string& output_name, const int stream_id, const int step_id,
TensorData& data)
{
data.data_ptr = nullptr;
data.batch1_size = 0;
data.is_valid = false;
// If json data is available then try to retrieve the data from there
if (!output_data_.empty()) {
RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id));
std::string key_name(
output_name + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
// Get the data and the corresponding byte-size
auto it = output_data_.find(key_name);
if (it != output_data_.end()) {
std::vector<char>* data_vec = &it->second;
data.is_valid = true;
data.batch1_size = data_vec->size();
data.data_ptr = (const uint8_t*)data_vec->data();
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateIndexes(int stream_id, int step_id)
{
if (stream_id < 0 || stream_id >= (int)data_stream_cnt_) {
return cb::Error(
"stream_id for retrieving the data should be less than " +
std::to_string(data_stream_cnt_) + ", got " +
std::to_string(stream_id),
pa::GENERIC_ERROR);
}
if (step_id < 0 || step_id >= (int)step_num_[stream_id]) {
return cb::Error(
"step_id for retrieving the data should be less than " +
std::to_string(step_num_[stream_id]) + ", got " +
std::to_string(step_id),
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetInputShape(
const ModelTensor& input, const int stream_id, const int step_id,
std::vector<int64_t>* provided_shape)
{
std::string key_name(
input.name_ + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
provided_shape->clear();
// Prefer the values read from file over the ones provided from
// CLI
auto it = input_shapes_.find(key_name);
if (it != input_shapes_.end()) {
*provided_shape = it->second;
} else {
*provided_shape = input.shape_;
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadTensorData(
const rapidjson::Value& step,
const std::shared_ptr<ModelTensorMap>& tensors, const int stream_index,
const int step_index, const bool is_input)
{
auto& tensor_data = is_input ? input_data_ : output_data_;
auto& tensor_shape = is_input ? input_shapes_ : output_shapes_;
for (const auto& io : *tensors) {
if (step.HasMember(io.first.c_str())) {
std::string key_name(
io.first + "_" + std::to_string(stream_index) + "_" +
std::to_string(step_index));
auto it = tensor_data.emplace(key_name, std::vector<char>()).first;
const rapidjson::Value& tensor = step[(io.first).c_str()];
const rapidjson::Value* content;
// Check if the input data file is malformed
if (!(tensor.IsArray() || tensor.IsObject())) {
return cb::Error("Input data file is malformed.", pa::GENERIC_ERROR);
}
if (tensor.IsArray()) {
content = &tensor;
} else {
// Populate the shape values first if available
if (tensor.HasMember("shape")) {
auto shape_it =
tensor_shape.emplace(key_name, std::vector<int64_t>()).first;
for (const auto& value : tensor["shape"].GetArray()) {
if (!value.IsInt()) {
return cb::Error(
"shape values must be integers.", pa::GENERIC_ERROR);
}
shape_it->second.push_back(value.GetInt());
}
}
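// Binary tensor content may be supplied directly as a base64 string under
// the "b64" key; otherwise a "content" array is required.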
if (tensor.HasMember("b64")) {
content = &tensor;
} else {
if (!tensor.HasMember("content")) {
return cb::Error(
"missing content field. ( Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
content = &tensor["content"];
}
}
if (content->IsArray()) {
RETURN_IF_ERROR(SerializeExplicitTensor(
*content, io.second.datatype_, &it->second));
} else {
if (content->IsObject() && content->HasMember("b64")) {
if ((*content)["b64"].IsString()) {
const std::string& encoded = (*content)["b64"].GetString();
it->second.resize(encoded.length());
base64::decoder D;
int size =
D.decode(encoded.c_str(), encoded.length(), &it->second[0]);
it->second.resize(size);
} else {
return cb::Error(
"the value of b64 field should be of type string ( "
"Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
} else {
return cb::Error(
"The tensor values are not supported. Expected an array or "
"b64 string ( Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
}
RETURN_IF_ERROR(ValidateTensor(io.second, stream_index, step_index));
} else if (io.second.is_optional_ == false) {
return cb::Error(
"missing tensor " + io.first +
" ( Location stream id: " + std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadFile(const std::string& path, std::vector<char>* contents)
{
std::ifstream in(path, std::ios::in | std::ios::binary);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
in.seekg(0, std::ios::end);
int file_size = in.tellg();
if (file_size > 0) {
contents->resize(file_size);
in.seekg(0, std::ios::beg);
in.read(&(*contents)[0], contents->size());
}
in.close();
// If size is invalid, report after ifstream is closed
if (file_size < 0) {
return cb::Error(
"failed to get size for file '" + path + "'", pa::GENERIC_ERROR);
} else if (file_size == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadTextFile(
const std::string& path, std::vector<std::string>* contents)
{
std::ifstream in(path);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
std::string current_string;
while (std::getline(in, current_string)) {
contents->push_back(current_string);
}
in.close();
if (contents->size() == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensor(
const ModelTensor& model_tensor, const int stream_index,
const int step_index)
{
std::string key_name(
model_tensor.name_ + "_" + std::to_string(stream_index) + "_" +
std::to_string(step_index));
auto data_it = input_data_.find(key_name);
if (data_it == input_data_.end()) {
  // Check against the container the iterator actually came from.
  data_it = output_data_.find(key_name);
  if (data_it == output_data_.end()) {
    return cb::Error("Can't validate a nonexistent tensor");
  }
}
auto shape_it = input_shapes_.find(key_name);
const std::vector<char>& data = data_it->second;
const std::vector<int64_t>& shape = (shape_it == input_shapes_.end())
? model_tensor.shape_
: shape_it->second;
int64_t batch1_byte = ByteSize(shape, model_tensor.datatype_);
RETURN_IF_ERROR(ValidateTensorShape(shape, model_tensor));
RETURN_IF_ERROR(ValidateTensorDataSize(data, batch1_byte, model_tensor));
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensorShape(
const std::vector<int64_t>& shape, const ModelTensor& model_tensor)
{
int element_count = ElementCount(shape);
if (element_count < 0) {
return cb::Error(
"The variable-sized tensor \"" + model_tensor.name_ +
"\" with model shape " + ShapeVecToString(model_tensor.shape_) +
" needs to have its shape fully defined. See the --shape option.",
pa::GENERIC_ERROR);
}
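// A model dimension of -1 is variable-sized and accepts any provided value.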
bool is_error = false;
if (shape.size() != model_tensor.shape_.size()) {
is_error = true;
}
for (size_t i = 0; i < shape.size() && !is_error; i++) {
if (shape[i] != model_tensor.shape_[i] && model_tensor.shape_[i] != -1) {
is_error = true;
}
}
if (is_error) {
return cb::Error(
"The supplied shape of " + ShapeVecToString(shape) + " for input \"" +
model_tensor.name_ +
"\" is incompatible with the model's input shape of " +
ShapeVecToString(model_tensor.shape_));
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensorDataSize(
const std::vector<char>& data, int64_t batch1_byte,
const ModelTensor& model_tensor)
{
// Validate that the supplied data matches the amount of data expected based
// on the shape
if (batch1_byte > 0 && (size_t)batch1_byte != data.size()) {
return cb::Error(
"mismatch in the data provided for " + model_tensor.name_ +
". Expected: " + std::to_string(batch1_byte) +
" bytes, Got: " + std::to_string(data.size()) + " bytes",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateParsingMode(const rapidjson::Value& steps)
{
// If this is our first time parsing data, capture the mode
if (step_num_.size() == 0) {
multiple_stream_mode_ = steps.IsArray();
} else {
if (steps.IsArray() != multiple_stream_mode_) {
return cb::Error(
"Inconsistency in input-data provided. Can not have a combination of "
"objects and arrays inside of the Data array",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <fstream>
#include "model_parser.h"
#include "perf_utils.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockDataLoader;
#endif
class DataLoader {
public:
DataLoader(size_t batch_size);
/// Returns the total number of data streams available.
size_t GetDataStreamsCount() { return data_stream_cnt_; }
/// Returns the total number of data steps available for the requested data
/// stream id.
/// \param stream_id The target stream id
virtual size_t GetTotalSteps(size_t stream_id)
{
if (stream_id < data_stream_cnt_) {
return step_num_[stream_id];
}
return 0;
}
/// Reads the input data from the specified data directory.
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param data_directory The path to the directory containing the data
cb::Error ReadDataFromDir(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& data_directory);
/// Reads the input data from the specified json file.
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param json_file The json file containing the user-provided input
/// data.
/// Returns error object indicating status
virtual cb::Error ReadDataFromJSON(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& json_file);
/// Generates the input data to use with the inference requests
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param zero_input Whether or not to use zero value for buffer
/// initialization.
/// \param string_length The length of the string to generate for
/// tensor inputs.
/// \param string_data The user provided string to use to populate
/// string tensors
/// Returns error object indicating status
cb::Error GenerateData(
std::shared_ptr<ModelTensorMap> inputs, const bool zero_input,
const size_t string_length, const std::string& string_data);
/// Helper function to access data for the specified input
/// \param input The target model input tensor
/// \param stream_id The data stream_id to use for retrieving input data.
/// \param step_id The data step_id to use for retrieving input data.
/// \param data Returns the input TensorData
/// Returns error object indicating status
cb::Error GetInputData(
const ModelTensor& input, const int stream_id, const int step_id,
TensorData& data);
/// Helper function to get the shape values to the input
/// \param input The target model input tensor
/// \param stream_id The data stream_id to use for retrieving input shape.
/// \param step_id The data step_id to use for retrieving input shape.
/// \param shape returns the pointer to the vector containing the shape
/// values.
/// Returns error object indicating status
cb::Error GetInputShape(
const ModelTensor& input, const int stream_id, const int step_id,
std::vector<int64_t>* shape);
/// Helper function to access data for the specified output. nullptr will be
/// returned if there is no data specified.
/// \param output_name The name of the output tensor
/// \param stream_id The data stream_id to use for retrieving output data.
/// \param step_id The data step_id to use for retrieving output data.
/// \param data Returns the output TensorData
/// Returns error object indicating status
cb::Error GetOutputData(
const std::string& output_name, const int stream_id, const int step_id,
TensorData& data);
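/// Illustrative usage sketch only. 'inputs', 'outputs', and 'input_tensor'
/// below are placeholders assumed to be supplied by the caller; they are not
/// defined in this header.
///
///   DataLoader loader(1 /* batch_size */);
///   RETURN_IF_ERROR(loader.ReadDataFromJSON(inputs, outputs, "data.json"));
///   TensorData data;
///   RETURN_IF_ERROR(loader.GetInputData(input_tensor, 0, 0, data));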
/// Return an error if the stream index or step index are invalid
cb::Error ValidateIndexes(int stream_index, int step_index);
protected:
/// Parses the input and output data from the json document
/// \param inputs The input tensors of a model
/// \param outputs The output tensors of a model
/// \param json The json document containing the raw json inputs/outputs
/// \return Returns error object indicating status
cb::Error ParseData(
const rapidjson::Document& json,
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs);
private:
/// Reads the data from file specified by path into vector of characters
/// \param path The complete path to the file to be read
/// \param contents The character vector that will contain the data read
/// \return error status. Returns Non-Ok if an error is encountered during
/// read operation.
virtual cb::Error ReadFile(
const std::string& path, std::vector<char>* contents);
/// Reads the string from file specified by path into vector of strings
/// \param path The complete path to the file to be read
/// \param contents The string vector that will contain the data read
/// \return error status. Returns Non-Ok if an error is encountered during
/// read operation.
virtual cb::Error ReadTextFile(
const std::string& path, std::vector<std::string>* contents);
/// Helper function to read data for the specified input from json
/// \param step the DOM for current step
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param stream_index the stream index the data should be exported to.
/// \param step_index the step index the data should be exported to.
/// Returns error object indicating status
cb::Error ReadTensorData(
const rapidjson::Value& step,
const std::shared_ptr<ModelTensorMap>& tensors, const int stream_index,
const int step_index, const bool is_input);
/// Helper function to validate the provided data and shape for the tensor
/// \param input The target model input or output tensor
/// \param stream_index the stream index the data should be exported to.
/// \param step_index the step index the data should be exported to.
/// Returns error object indicating status
cb::Error ValidateTensor(
const ModelTensor& model_tensor, const int stream_index,
const int step_index);
/// Helper function to validate the provided shape for a tensor
/// \param shape Shape for the tensor
/// \param model_tensor The tensor to validate
/// Returns error object indicating status
cb::Error ValidateTensorShape(
const std::vector<int64_t>& shape, const ModelTensor& model_tensor);
/// Helper function to validate the provided data's size
/// \param data The provided data for the tensor
/// \param batch1_byte The expected number of bytes of data
/// \param model_tensor The tensor to validate
/// Returns error object indicating status
cb::Error ValidateTensorDataSize(
const std::vector<char>& data, int64_t batch1_byte,
const ModelTensor& model_tensor);
/// Helper function to validate consistency of parsing mode for provided input
/// data. The code explicitly does not support a mixture of objects (multiple
/// entries of a single stream) and arrays (multiple streams)
///
/// \param steps The json data provided for one or multiple streams
cb::Error ValidateParsingMode(const rapidjson::Value& steps);
// The batch_size_ for the data
size_t batch_size_{1};
// The total number of data streams available.
size_t data_stream_cnt_{0};
// A vector containing the number of data steps available for each stream
// id.
std::vector<size_t> step_num_;
// User-provided input data; it is preferred over synthetic data
std::unordered_map<std::string, std::vector<char>> input_data_;
std::unordered_map<std::string, std::vector<int64_t>> input_shapes_;
// User provided output data for validation
std::unordered_map<std::string, std::vector<char>> output_data_;
std::unordered_map<std::string, std::vector<int64_t>> output_shapes_;
// Placeholder for generated input data, which will be used for all inputs
// except string
std::vector<uint8_t> input_buf_;
// Tracks what type of input data has been provided
bool multiple_stream_mode_ = false;
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockDataLoader;
public:
DataLoader() = default;
#endif
};
}} // namespace triton::perfanalyzer
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# **Perf Analyzer Documentation**
| [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) |
| -------------------------------------- | -------------------------------------------- | ---------------------------------- |
## **Installation**
See the [Installation Guide](install.md) for details on how to install Perf
Analyzer.
## **Getting Started**
The [Quick Start Guide](quick_start.md) will show you how to use Perf
Analyzer to profile a simple PyTorch model.
## **User Guide**
The User Guide describes the Perf Analyzer command line options, how to specify
model input data, the performance measurement modes, the performance metrics and
outputs, how to benchmark different servers, and more.
- [Perf Analyzer CLI](cli.md)
- [Inference Load Modes](inference_load_modes.md)
- [Input Data](input_data.md)
- [Measurements & Metrics](measurements_metrics.md)
- [Benchmarking](benchmarking.md)
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Benchmarking Triton via HTTP or gRPC endpoint
This is the default mode for Perf Analyzer.
# Benchmarking Triton directly via C API
Besides using HTTP or gRPC server endpoints to communicate with Triton, Perf
Analyzer also allows users to benchmark Triton directly using the C API. HTTP
and gRPC endpoints introduce additional latency in the pipeline, which may not
be of interest to users who are using Triton via the C API within their
application. Specifically, this feature is useful for benchmarking a bare
minimum Triton without the additional overhead of HTTP/gRPC communication.
## Prerequisite
Pull the Triton SDK and the Triton Server container images on the target
machine.
Since you will need access to the `tritonserver` install, it might be easier if
you copy the `perf_analyzer` binary to the Inference Server container.
## Required parameters
Use the [`--help`](cli.md#--help) option to see a complete list of supported
command line arguments. By default, Perf Analyzer expects the Triton instance to
already be running. You can configure C API mode using the
[`--service-kind`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. In addition, you will need to point Perf Analyzer to the Triton server
library path using the
[`--triton-server-directory`](cli.md#--triton-server-directorypath) option and
the model repository path using the
[`--model-repository`](cli.md#--model-repositorypath) option.
An example run would look like:
```
$ perf_analyzer -m my_model --service-kind=triton_c_api --triton-server-directory=/opt/tritonserver --model-repository=/my/model/repository
...
*** Measurement Settings ***
Service Kind: Triton C-API
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 353
Throughput: 19.6095 infer/sec
Avg latency: 50951 usec (standard deviation 2265 usec)
p50 latency: 50833 usec
p90 latency: 50923 usec
p95 latency: 50940 usec
p99 latency: 50985 usec
Server:
Inference count: 353
Execution count: 353
Successful request count: 353
Avg request latency: 50841 usec (overhead 20 usec + queue 63 usec + compute input 35 usec + compute infer 50663 usec + compute output 59 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 19.6095 infer/sec, latency 50951 usec
```
## Non-supported functionalities
There are a few functionalities that are missing from C API mode. They are:
1. Async mode ([`--async`](cli.md#--async))
2. For additional known non-working cases, please refer to
[qa/L0_perf_analyzer_capi/test.sh](https://github.com/triton-inference-server/server/blob/main/qa/L0_perf_analyzer_capi/test.sh#L239-L277)
# Benchmarking TensorFlow Serving
Perf Analyzer can also be used to benchmark models deployed on
[TensorFlow Serving](https://github.com/tensorflow/serving) using the
[`--service-kind=tfserving`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. Only the gRPC protocol is supported.
The following invocation demonstrates how to configure Perf Analyzer to issue
requests to a running instance of `tensorflow_model_server`:
```
$ perf_analyzer -m resnet50 --service-kind tfserving -i grpc -b 1 -p 5000 -u localhost:8500
*** Measurement Settings ***
Batch size: 1
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 829
Throughput: 165.8 infer/sec
Avg latency: 6032 usec (standard deviation 569 usec)
p50 latency: 5863 usec
p90 latency: 6655 usec
p95 latency: 6974 usec
p99 latency: 8093 usec
Avg gRPC time: 5984 usec ((un)marshal request/response 257 usec + response wait 5727 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 165.8 infer/sec, latency 6032 usec
```
You might have to specify a different URL ([`-u`](cli.md#-u-url)) to access
wherever the server is running. The Perf Analyzer report will only include
statistics measured on the client side.
**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee
optimal tuning for TensorFlow Serving. However, a single benchmarking tool that
can be used to stress the inference servers in an identical manner is important
for performance analysis.
The following points are important for interpreting the results:
1. `Concurrent Request Execution`:
TensorFlow Serving (TFS), as of version 2.8.0, by default creates a thread for
each request, and each thread individually submits its request to the
TensorFlow session. There
is a resource limit on the number of concurrent threads serving requests.
When benchmarking at a higher request concurrency, you can see higher
throughput because of this. Unlike TFS, by default Triton is configured with
only a single
[instance count](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups).
Hence, at a higher request concurrency, most of the requests are blocked on
the instance availability. To configure Triton to behave like TFS, set the
instance count to a reasonably high value and then set
[MAX_SESSION_SHARE_COUNT](https://github.com/triton-inference-server/tensorflow_backend#parameters)
parameter in the model `config.pbtxt` to the same value. For some context,
TFS sets its thread constraint to four times the number of schedulable CPUs.
2. `Different library versions`:
The version of TensorFlow might differ between Triton and TensorFlow Serving
being benchmarked. Even the versions of CUDA libraries might differ between
the two solutions. The performance of models can be susceptible to the
versions of these libraries. For a single request concurrency, if the
`compute_infer` time reported by Perf Analyzer when benchmarking Triton is as
large as the latency reported by Perf Analyzer when benchmarking TFS, then
the performance difference is likely because of the difference in the
software stack and outside the scope of Triton.
3. `CPU Optimization`:
TFS has separate builds for CPU and GPU targets. They have target-specific
optimization. Unlike TFS, Triton has a single build which is optimized for
execution on GPUs. When collecting performance on CPU models on Triton, try
running Triton with the environment variable `TF_ENABLE_ONEDNN_OPTS=1`.
# Benchmarking TorchServe
Perf Analyzer can also be used to benchmark
[TorchServe](https://github.com/pytorch/serve) using the
[`--service-kind=torchserve`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. Only the HTTP protocol is supported. It also requires the input to be
provided via a JSON file.
The following invocation demonstrates how to configure Perf Analyzer to issue
requests to a running instance of `torchserve` assuming the location holds
`kitten_small.jpg`:
```
$ perf_analyzer -m resnet50 --service-kind torchserve -i http -u localhost:8080 -b 1 -p 5000 --input-data data.json
Successfully read data for 1 stream/streams with 1 step/steps.
*** Measurement Settings ***
Batch size: 1
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 799
Throughput: 159.8 infer/sec
Avg latency: 6259 usec (standard deviation 397 usec)
p50 latency: 6305 usec
p90 latency: 6448 usec
p95 latency: 6494 usec
p99 latency: 7158 usec
Avg HTTP time: 6272 usec (send/recv 77 usec + response wait 6195 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 159.8 infer/sec, latency 6259 usec
```
The content of `data.json`:
```json
{
"data" :
[
{
"TORCHSERVE_INPUT" : ["kitten_small.jpg"]
}
]
}
```
You might have to specify a different URL ([`-u`](cli.md#-u-url)) to access
wherever the server is running. The Perf Analyzer report will only include
statistics measured on the client side.
**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee
optimal tuning for TorchServe. However, a single benchmarking tool that can be
used to stress the inference servers in an identical manner is important for
performance analysis.
# Advantages of using Perf Analyzer over third-party benchmark suites
Triton Inference Server offers the entire serving solution which includes
[client libraries](https://github.com/triton-inference-server/client) that are
optimized for Triton. Using third-party benchmark suites like `jmeter` fails to
take advantage of the optimized libraries. Some of these optimizations include,
but are not limited to:
1. Using
[binary tensor data extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#binary-tensor-data-extension)
with HTTP requests.
2. Effective re-use of gRPC message allocation in subsequent requests.
3. Avoiding extra memory copy via libcurl interface.
These optimizations can have a tremendous impact on overall performance. Using
Perf Analyzer for benchmarking directly allows a user to access these
optimizations in their study.
In addition, Perf Analyzer is highly customizable and supports many Triton
features as described in this document. This, along with a detailed report,
allows a user to identify performance bottlenecks and experiment with different
features before deciding upon what works best for them.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Perf Analyzer CLI
This document details the Perf Analyzer command line interface:
- [General Options](#general-options)
- [Measurement Options](#measurement-options)
- [Sequence Model Options](#sequence-model-options)
- [Input Data Options](#input-data-options)
- [Request Options](#request-options)
- [Server Options](#server-options)
- [Prometheus Metrics Options](#prometheus-metrics-options)
- [Report Options](#report-options)
- [Trace Options](#trace-options)
- [Deprecated Options](#deprecated-options)
## General Options
#### `-?`
#### `-h`
#### `--help`
Prints a description of the Perf Analyzer command line interface.
#### `-m <string>`
Specifies the model name for Perf Analyzer to run.
This is a required option.
#### `-x <string>`
Specifies the version of the model to be used. If not specified, the most
recent version (the highest numbered version) of the model will be used.
#### `--service-kind=[triton|triton_c_api|tfserving|torchserve]`
Specifies the kind of service for Perf Analyzer to generate load for. Note: in
order to use the `torchserve` backend, the `--input-data` option must point to
a
JSON file holding data in the following format:
```
{
"data": [
{
"TORCHSERVE_INPUT": [
"<complete path to the content file>"
]
},
{...},
...
]
}
```
The type of file here will depend on the model. In order to use `triton_c_api`
you must specify the Triton server install path and the model repository path
via the `--triton-server-directory` and `--model-repository` options.
Default is `triton`.
#### `--bls-composing-models=<string>`
Specifies the list of all BLS composing models as a comma separated list of
model names (with optional model version number after a colon for each) that may
be called by the input BLS model. For example,
`--bls-composing-models=modelA:3,modelB` would specify that modelA and modelB
are composing models that may be called by the input BLS model, and that modelA
will use version 3, while modelB's version is unspecified.
#### `--model-signature-name=<string>`
Specifies the signature name of the saved model to use.
Default is `serving_default`. This option will be ignored if `--service-kind`
is not `tfserving`.
#### `-v`
Enables verbose mode. May be specified an additional time (`-v -v`) to enable
extra verbose mode.
## Measurement Options
#### `--measurement-mode=[time_windows|count_windows]`
Specifies the mode used for stabilizing measurements. 'time_windows' will
create windows such that the duration of each window is equal to
`--measurement-interval`. 'count_windows' will create windows such that there
are at least `--measurement-request-count` requests in each window and that
the window is at least one second in duration (adding more requests if
necessary).
Default is `time_windows`.
#### `-p <n>`
#### `--measurement-interval=<n>`
Specifies the time interval used for each measurement in milliseconds when
`--measurement-mode=time_windows` is used. Perf Analyzer will sample a time
interval specified by this option and take measurement over the requests
completed within that time interval.
Default is `5000`.
#### `--measurement-request-count=<n>`
Specifies the minimum number of requests to be collected in each measurement
window when `--measurement-mode=count_windows` is used.
Default is `50`.
#### `-s <n>`
#### `--stability-percentage=<n>`
Specifies the allowed variation in latency measurements when determining if a
result is stable. The measurement is considered stable if the ratio of max /
min from the recent 3 measurements is within (stability percentage)% in terms
of both inferences per second and latency.
Default is `10`(%).
#### `--percentile=<n>`
Specifies the confidence value as a percentile that will be used to determine
if a measurement is stable. For example, a value of `85` indicates that the
85th percentile latency will be used to determine stability. The percentile
will also be reported in the results.
Default is `-1` indicating that the average latency is used to determine
stability.
#### `-r <n>`
#### `--max-trials=<n>`
Specifies the maximum number of measurements when attempting to reach stability
of inferences per second and latency for each concurrency or request rate
during the search. Perf Analyzer will terminate if the measurement is still
unstable after the maximum number of trials.
Default is `10`.
#### `--concurrency-range=<start:end:step>`
Specifies the range of concurrency levels covered by Perf Analyzer. Perf
Analyzer will start from the concurrency level of 'start' and go until 'end'
with a stride of 'step'.
The defaults for 'end' and 'step' are `1`. If 'end' is not specified, then Perf
Analyzer will run for a single concurrency level determined by 'start'. If
'end' is set as `0`, then the concurrency limit will be incremented by 'step'
until the latency threshold is met. 'end' and `--latency-threshold` cannot
both be `0`. 'end' cannot be `0` for sequence models while using asynchronous
mode.
#### `--request-rate-range=<start:end:step>`
Specifies the range of request rates for load generated by Perf Analyzer. This
option can take floating-point values. The search along the request rate range
is enabled only when using this option.
If not specified, then Perf Analyzer will search along the concurrency range.
Perf Analyzer will start from the request rate of 'start' and go until 'end'
with a stride of 'step'. Default values of 'start', 'end' and 'step' are all
`1.0`. If 'end' is not specified, then Perf Analyzer will run for a single
request rate as determined by 'start'. If 'end' is set as `0.0`, then the
request rate will be incremented by 'step' until the latency threshold is met.
'end' and `--latency-threshold` cannot both be `0`.
#### `--request-distribution=[constant|poisson]`
Specifies the time interval distribution between dispatching inference requests
to the server. A Poisson distribution closely mimics the real-world workload on
a server. This option is ignored if not using `--request-rate-range`.
Default is `constant`.
#### `-l <n>`
#### `--latency-threshold=<n>`
Specifies the limit on the observed latency, in milliseconds. Perf Analyzer
will terminate the concurrency or request rate search once the measured latency
exceeds this threshold.
Default is `0` indicating that Perf Analyzer will run for the entire
concurrency or request rate range.
#### `--binary-search`
Enables binary search on the specified search range (concurrency or request
rate). This option requires 'start' and 'end' to be explicitly specified in
the concurrency range or request rate range. When using this option, 'step'
acts more like a precision: the lower the 'step', the more iterations are taken
along the search path to find a suitable convergence point.
When `--binary-search` is not specified, linear search is used.
#### `--request-intervals=<path>`
Specifies a path to a file containing time intervals in microseconds, one
interval per line. Perf Analyzer will attempt to keep the time intervals
between successive generated requests as close as possible to the intervals in
this file. This option can be used to apply a custom load pattern of interest
to the server. Perf Analyzer will loop around the file if the duration of
execution exceeds the amount of time specified by the intervals.
This option cannot be used with `--request-rate-range` or
`--concurrency-range`.
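For example, a hypothetical workflow (the file name `intervals.txt` and model
name `my_model` are illustrative) that replays a custom request pattern:
```bash
# Three intervals in microseconds (100 ms, 200 ms, 500 ms), looped as needed.
printf "100000\n200000\n500000\n" > intervals.txt
perf_analyzer -m my_model --request-intervals=intervals.txt
```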
#### `--max-threads=<n>`
Specifies the maximum number of threads that will be created to provide the
desired concurrency or request rate. However, when running in synchronous mode
with `--concurrency-range` having an explicit 'end' specification, this value
will be ignored.
Default is `4` if `--request-rate-range` is specified, otherwise default is
`16`.
## Sequence Model Options
#### `--num-of-sequences=<n>`
Specifies the number of concurrent sequences for sequence models. This option
is ignored when `--request-rate-range` is not specified.
Default is `4`.
#### `--sequence-length=<n>`
Specifies the base length of a sequence used for sequence models. A sequence
with length X will be composed of X requests to be sent as the elements in the
sequence. The actual length of the sequence will be within +/- Y% of the base
length, where Y defaults to 20% and is customizable via
`--sequence-length-variation`. If sequence length is unspecified and input data
is provided, the sequence length will be the number of inputs in the
user-provided input data.
Default is `20`.
#### `--sequence-length-variation=<n>`
Specifies the percentage variation in length of sequences. This option is only
valid when not using user-provided input data or when `--sequence-length` is
specified while using user-provided input data.
Default is `20`(%).
#### `--sequence-id-range=<start:end>`
Specifies the range of sequence IDs used by Perf Analyzer. Perf Analyzer will
start from the sequence ID of 'start' and go until 'end' (excluded). If 'end'
is not specified then Perf Analyzer will generate new sequence IDs without
bounds. If 'end' is specified and the concurrency setting may result in
maintaining a number of sequences more than the range of available sequence
IDs, Perf Analyzer will exit with an error due to possible sequence ID
collisions.
The default for 'start' is `1`, and 'end' is not specified (no bounds).
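For example, a hypothetical run (model name is illustrative) that restricts
sequence IDs to the range [100, 200):
```bash
# Sequence IDs are drawn from 100 (inclusive) to 200 (exclusive).
perf_analyzer -m my_sequence_model --sequence-id-range=100:200
```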
#### `--serial-sequences`
Enables the serial sequence mode, where at most one request is live per sequence
at any time.
Note: in request rate mode, this may prevent Perf Analyzer from achieving the
desired rate, especially if `--num-of-sequences` is too small.
## Input Data Options
#### `--input-data=[zero|random|<path>]`
Specifies the type of data that will be used for input in inference requests.
The available options are `zero`, `random`, and a path to a directory or a JSON
file.
When pointing to a JSON file, the user must adhere to the format described in
the [input data documentation](input_data.md). By specifying JSON data, users
can control data used with every request. Multiple data streams can be specified
for a sequence model, and Perf Analyzer will select a data stream in a
round-robin fashion for every new sequence. Multiple JSON files can also be
provided (`--input-data json_file1.json --input-data json_file2.json` and so on)
and Perf Analyzer will append data streams from each file. When using
`--service-kind=torchserve`, make sure this option points to a JSON file.
If the option is a path to a directory, then the directory must contain a
binary file for each non-string input and a text file for each string input,
each named the same as the input. Each file must contain the data required for
that input for a batch-1 request. Each binary file should contain the raw
binary representation of the input in row-major order. Each text file should
contain all strings needed by batch-1, one per line, listed in row-major order.
Default is `random`.
#### `-b <n>`
Specifies the batch size for each request sent.
Default is `1`.
#### `--shape=<string>`
Specifies the shape used for the specified input. The argument must be
specified as 'name:shape' where the shape is a comma-separated list for
dimension sizes. For example `--shape=input_name:1,2,3` indicates that the
input `input_name` has tensor shape [ 1, 2, 3 ]. `--shape` may be specified
multiple times to specify shapes for different inputs.
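For example, a sketch for a hypothetical model with two variable-sized inputs
named `IMAGE` and `MASK`:
```bash
# Provide a concrete shape for each variable-sized input.
perf_analyzer -m my_model --shape=IMAGE:3,224,224 --shape=MASK:224,224
```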
#### `--string-data=<string>`
Specifies the string to initialize string input buffers. Perf Analyzer will
replicate the given string to build tensors of the required shape. When this
option is used, `--string-length` has no effect. This option is ignored if
`--input-data` points to a JSON file or directory.
#### `--string-length=<n>`
Specifies the length of the random strings to be generated by Perf Analyzer
for string input. This option is ignored if `--input-data` points to a
JSON file or directory.
Default is `128`.
#### `--shared-memory=[none|system|cuda]`
Specifies the type of the shared memory to use for input and output data.
Default is `none`.
#### `--output-shared-memory-size=<n>`
Specifies the size, in bytes, of the shared memory region to allocate per
output tensor. Only needed when one or more of the outputs are of string type
and/or have variable shape. The value should be larger than the size of the
largest output tensor that the model is expected to return. Perf Analyzer will
use the following formula to calculate the total shared memory to allocate:
output_shared_memory_size * number_of_outputs * batch_size.
Default is `102400` (100 KB).
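As a worked example of the formula above (all values hypothetical): with
`--output-shared-memory-size=204800`, a model returning 2 outputs, and a batch
size of 4, Perf Analyzer would allocate 204800 * 2 * 4 = 1,638,400 bytes:
```bash
# Hypothetical sizing: 204800 bytes per output x 2 outputs x batch size 4.
perf_analyzer -m my_model -b 4 --shared-memory=system --output-shared-memory-size=204800
```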
#### `--input-tensor-format=[binary|json]`
Specifies the Triton inference request input tensor format. Only valid when HTTP
protocol is used.
Default is `binary`.
#### `--output-tensor-format=[binary|json]`
Specifies the Triton inference response output tensor format. Only valid when
HTTP protocol is used.
Default is `binary`.
## Request Options
#### `-i [http|grpc]`
Specifies the communication protocol to use. The available protocols are HTTP
and gRPC.
Default is `http`.
#### `-a`
#### `--async`
Enables asynchronous mode in Perf Analyzer.
By default, Perf Analyzer will use a synchronous request API for inference.
However, if the model is sequential, then the default mode is asynchronous.
Specify `--sync` to operate sequential models in synchronous mode. In
synchronous mode, Perf Analyzer will start threads equal to the concurrency
level. Use asynchronous mode to limit the number of threads, yet maintain the
concurrency.
#### `--sync`
Enables synchronous mode in Perf Analyzer. Can be used to operate Perf
Analyzer with sequential models in synchronous mode.
#### `--streaming`
Enables the use of streaming API. This option is only valid with gRPC protocol.
#### `-H <string>`
Specifies the header that will be added to HTTP requests (ignored for gRPC
requests). The header must be specified as 'Header:Value'. `-H` may be
specified multiple times to add multiple headers.
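For example, a sketch that adds two hypothetical headers to every HTTP request
(header names and values are illustrative):
```bash
# -H may be repeated; headers are ignored when using gRPC.
perf_analyzer -m my_model -i http -H "Authorization: Bearer my_token" -H "X-Trace-Id: 123"
```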
#### `--grpc-compression-algorithm=[none|gzip|deflate]`
Specifies the compression algorithm to be used by gRPC when sending requests.
Only supported when gRPC protocol is being used.
Default is `none`.
## Server Options
#### `-u <url>`
Specifies the URL for the server.
Default is `localhost:8000` when using `--service-kind=triton` with HTTP.
Default is `localhost:8001` when using `--service-kind=triton` with gRPC.
Default is `localhost:8500` when using `--service-kind=tfserving`.
#### `--ssl-grpc-use-ssl`
Enables usage of an encrypted channel to the server.
#### `--ssl-grpc-root-certifications-file=<path>`
Specifies the path to the file containing the PEM encoding of the server root
certificates.
#### `--ssl-grpc-private-key-file=<path>`
Specifies the path to the file containing the PEM encoding of the client's
private key.
#### `--ssl-grpc-certificate-chain-file=<path>`
Specifies the path to the file containing the PEM encoding of the client's
certificate chain.
#### `--ssl-https-verify-peer=[0|1]`
Specifies whether to verify the peer's SSL certificate. See
https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html for the meaning of each
value.
Default is `1`.
#### `--ssl-https-verify-host=[0|1|2]`
Specifies whether to verify the certificate's name against host. See
https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html for the meaning of each
value.
Default is `2`.
#### `--ssl-https-ca-certificates-file=<path>`
Specifies the path to Certificate Authority (CA) bundle.
#### `--ssl-https-client-certificate-file=<path>`
Specifies the path to the SSL client certificate.
#### `--ssl-https-client-certificate-type=[PEM|DER]`
Specifies the type of the client SSL certificate.
Default is `PEM`.
#### `--ssl-https-private-key-file=<path>`
Specifies the path to the private key file for TLS and SSL client cert.
#### `--ssl-https-private-key-type=[PEM|DER]`
Specifies the type of the private key file.
Default is `PEM`.
#### `--triton-server-directory=<path>`
Specifies the Triton server install path. Required by and only used when C API
is used (`--service-kind=triton_c_api`).
Default is `/opt/tritonserver`.
#### `--model-repository=<path>`
Specifies the model repository directory path for loading models. Required by
and only used when C API is used (`--service-kind=triton_c_api`).
## Prometheus Metrics Options
#### `--collect-metrics`
Enables the collection of server-side inference server metrics. Perf Analyzer
will output metrics in the CSV file generated with the `-f` option. Only valid
when the `--verbose-csv` option is also used.
#### `--metrics-url=<url>`
Specifies the URL to query for server-side inference server metrics.
Default is `localhost:8002/metrics`.
#### `--metrics-interval=<n>`
Specifies how often, in milliseconds, Perf Analyzer should query for
server-side inference server metrics within each measurement window.
Default is `1000`.
## Report Options
#### `-f <path>`
Specifies the path that the latency report file will be generated at.
When `-f` is not specified, a latency report will not be generated.
#### `--profile-export-file <path>`
Specifies the path that the profile export will be generated at.
When `--profile-export-file` is not specified, a profile export will not be
generated.
#### `--verbose-csv`
Enables additional information being output to the CSV file generated by Perf
Analyzer.
## Trace Options
#### `--trace-file=<path>`
Specifies the file where trace output will be saved.
If `--log-frequency` is also specified, this argument value will be the
prefix of the files to save the trace output. See `--log-frequency` for
details. Only used for `--service-kind=triton`.
#### `--trace-level=[OFF|TIMESTAMPS|TENSORS]`
Specifies a trace level. `OFF` disables tracing. `TIMESTAMPS` traces
timestamps. `TENSORS` traces tensors. It may be specified multiple times to
trace multiple types of information.
Default is `OFF`.
#### `--trace-rate=<n>`
Specifies the trace sampling rate (traces per second).
Default is `1000`.
#### `--trace-count=<n>`
Specifies the number of traces to be sampled. If the value is `-1`, the number
of traces to be sampled will not be limited.
Default is `-1`.
#### `--log-frequency=<n>`
Specifies the trace log frequency. If the value is `0`, Triton will only log
the trace output to path specified via `--trace-file` when shutting down.
Otherwise, Triton will log the trace output to the path specified via
`--trace-file`.<idx> when it collects the specified number of traces. For
example, if `--trace-file` is specified to be `trace_file.log`, and if the log
frequency is `100`, when Triton collects the 100th trace, it logs the traces
to file `trace_file.log.0`, and when it collects the 200th trace, it logs the
101st to the 200th traces to file `trace_file.log.1`.
Default is `0`.
## Deprecated Options
#### `--data-directory=<path>`
**DEPRECATED**
Alias for `--input-data=<path>` where `<path>` is the path to a directory. See
`--input-data` option documentation for details.
#### `-c <n>`
**DEPRECATED**
Specifies the maximum concurrency that Perf Analyzer will search up to. Cannot
be used with `--concurrency-range`.
#### `-d`
**DEPRECATED**
Enables dynamic concurrency mode. Perf Analyzer will search along
concurrencies up to the maximum concurrency specified via `-c <n>`. Cannot be
used with `--concurrency-range`.
#### `-t <n>`
**DEPRECATED**
Specifies the number of concurrent requests. Cannot be used with
`--concurrency-range`.
Default is `1`.
#### `-z`
**DEPRECATED**
Alias for `--input-data=zero`. See `--input-data` option documentation for
details.
{
"experiments": [
{
"experiment": {
"mode": "concurrency",
"value": 4
},
"requests": [
{
"timestamp": 1,
"sequence_id": 1,
"response_timestamps": [
2,
3,
4
]
},
{
"timestamp": 5,
"sequence_id": 2,
"response_timestamps": []
},
{
"timestamp": 6,
"sequence_id": 2,
"response_timestamps": [
7,
8,
9
]
}
],
"window_boundaries": [
1,
5,
6
]
}
],
"version": "1.2.3"
}
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Inference Load Modes
Perf Analyzer has several modes for generating inference request load for a
model.
## Concurrency Mode
In concurrency mode, Perf Analyzer attempts to send inference requests to the
server such that N requests are always outstanding during profiling. For
example, when using
[`--concurrency-range=4`](cli.md#--concurrency-rangestartendstep), Perf Analyzer
will attempt to have 4 outgoing inference requests at all times during
profiling.
## Request Rate Mode
In request rate mode, Perf Analyzer attempts to send N inference requests per
second to the server during profiling. For example, when using
[`--request-rate-range=20`](cli.md#--request-rate-rangestartendstep), Perf
Analyzer will attempt to send 20 requests per second during profiling.
## Custom Interval Mode
In custom interval mode, Perf Analyzer attempts to send inference requests
according to intervals (between requests, looping if necessary) provided by the
user in the form of a text file with one time interval (in microseconds) per
line. For example, when using
[`--request-intervals=my_intervals.txt`](cli.md#--request-intervalspath),
where `my_intervals.txt` contains:
```
100000
200000
500000
```
Perf Analyzer will attempt to send requests at the following times: 0.1s, 0.3s,
0.8s, 0.9s, 1.1s, 1.6s, and so on, during profiling.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Input Data
Use the [`--help`](cli.md#--help) option to see complete documentation for all
input data options. By default Perf Analyzer sends random data to all the inputs
of your model. You can select a different input data mode with the
[`--input-data`](cli.md#--input-datazerorandompath) option:
- _random_: (default) Send random data for each input. Note: Perf Analyzer only
generates random data once per input and reuses that for all inferences
- _zero_: Send zeros for each input.
- directory path: A path to a directory containing a binary file for each input,
named the same as the input. Each binary file must contain the data required
for that input for a batch-1 request. Each file should contain the raw binary
representation of the input in row-major order.
- file path: A path to a JSON file containing data to be used with every
inference request. See the "Real Input Data" section for further details.
[`--input-data`](cli.md#--input-datazerorandompath) can be provided multiple
times with different file paths to specify multiple JSON files.
For tensors with `STRING`/`BYTES` datatype, the
[`--string-length`](cli.md#--string-lengthn) and
[`--string-data`](cli.md#--string-datastring) options may be used in some cases
(see [`--help`](cli.md#--help) for full documentation).
For models that support batching you can use the [`-b`](cli.md#-b-n) option to
indicate the batch size of the requests that Perf Analyzer should send. For
models with variable-sized inputs you must provide the
[`--shape`](cli.md#--shapestring) argument so that Perf Analyzer knows what
shape tensors to use. For example, for a model that has an input called
`IMAGE` that has shape `[3, N, M]`, where `N` and `M` are variable-size
dimensions, to tell Perf Analyzer to send batch size 4 requests of shape
`[3, 224, 224]`:
```
$ perf_analyzer -m mymodel -b 4 --shape IMAGE:3,224,224
```
## Real Input Data
The performance of some models is highly dependent on the data used. For such
cases you can provide data to be used with every inference request made by Perf
Analyzer in a JSON file. Perf Analyzer will use the provided data in a
round-robin order when sending inference requests. For sequence models, if a
sequence length is specified via
[`--sequence-length`](cli.md#--sequence-lengthn), Perf Analyzer will also loop
through the provided data in a round-robin order up to the specified sequence
length (with a percentage variation customizable via
[`--sequence-length-variation`](cli.md#--sequence-length-variationn)).
Otherwise, the sequence length will be the number of inputs specified in
user-provided input data.
Each entry in the `"data"` array must specify all input tensors with the exact
size expected by the model for a single batch. The following example describes
data for a model with inputs named `INPUT0` and `INPUT1`, each with shape
`[4, 4]` and data type `INT32`:
```json
{
"data":
[
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
]
}
```
Note that the `[4, 4]` tensor has been flattened in a row-major format for the
inputs. In addition to specifying explicit tensors, you can also provide Base64
encoded binary data for the tensors. Each data object must list its data in a
row-major order. Binary data must be in little-endian byte order. The following
example highlights how this can be achieved:
```json
{
"data":
[
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
},
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
},
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
}
]
}
```
In the case of sequence models, multiple data streams can be specified in the JSON
file. Each sequence will get a data stream of its own and Perf Analyzer will
ensure the data from each stream is played back to the same correlation ID. The
below example highlights how to specify data for multiple streams for a sequence
model with a single input named `INPUT`, shape `[1]` and data type `STRING`:
```json
{
"data":
[
[
{
"INPUT": ["1"]
},
{
"INPUT": ["2"]
},
{
"INPUT": ["3"]
},
{
"INPUT": ["4"]
}
],
[
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
}
],
[
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
}
]
]
}
```
The above example describes three data streams with lengths 4, 3 and 2
respectively. Perf Analyzer will hence produce sequences of length 4, 3 and 2 in
this case.
You can also provide an optional `"shape"` field for the tensors. This is
especially useful when profiling models with variable-sized input tensors.
Additionally, note that when providing the `"shape"` field, tensor contents
must be provided separately in a `"content"` field in row-major order. The
specified shape values will override default input shapes provided as a command
line option (see [`--shape`](cli.md#--shapestring)) for variable-sized inputs.
In the absence of a `"shape"` field, the provided defaults will be used. There
is no need to specify shape as a command line option if all the input data
entries provide shape values for variable-sized tensors. Below is an example
JSON file for a model with a single input `INPUT`, shape `[-1, -1]` and data
type `INT32`:
```json
{
"data":
[
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [2,8]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [8,2]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [4,4]
}
}
]
}
```
The following example provides contents as base64-encoded strings with explicit
shapes:
```json
{
"data":
[
{
"INPUT":
{
"content": {"b64": "/9j/4AAQSkZ(...)"},
"shape": [7964]
}
},
{
"INPUT":
{
"content": {"b64": "/9j/4AAQSkZ(...)"},
"shape": [7964]
}
}
]
}
```
Note that for `STRING` type, an element is represented by a 4-byte unsigned
integer giving the length followed by the actual bytes. The byte array to be
encoded using base64 must include the 4-byte unsigned integers.
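As a minimal sketch of this encoding (assuming the same little-endian byte
order used for other binary data), the 5-byte string `hello` is prefixed with
its length as a 4-byte unsigned integer and then base64-encoded:
```bash
# 4-byte little-endian length (5) followed by the raw bytes of "hello".
printf '\x05\x00\x00\x00hello' | base64
# Expected output: BQAAAGhlbGxv
```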
### Output Validation
When real input data is provided, you can optionally request that Perf Analyzer
validate the inference output for the input data.
Validation output can be specified in the `"validation_data"` field and must
have the same format as the `"data"` field for real input. Note that the
entries in `"validation_data"` must align with `"data"` for proper mapping. The
following example describes validation data for a model with inputs named
`INPUT0` and `INPUT1`, outputs named `OUTPUT0` and `OUTPUT1`, where all tensors
have shape `[4, 4]` and data type `INT32`:
```json
{
"data":
[
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
],
"validation_data":
[
{
"OUTPUT0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
"OUTPUT1": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
}
]
}
```
Besides the above example, the validation outputs can be specified using the
same variations described in the real input data section.
# Shared Memory
By default Perf Analyzer sends input tensor data and receives output tensor data
over the network. You can instead instruct Perf Analyzer to use system shared
memory or CUDA shared memory to communicate tensor data. By using these options
you can model the performance that you can achieve by using shared memory in
your application. Use
[`--shared-memory=system`](cli.md#--shared-memorynonesystemcuda) to use system
(CPU) shared memory or
[`--shared-memory=cuda`](cli.md#--shared-memorynonesystemcuda) to use CUDA
shared memory.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Recommended Installation Method
## Triton SDK Container
The recommended way to "install" Perf Analyzer is to run the pre-built
executable from within the Triton SDK docker container available on the
[NVIDIA GPU Cloud Catalog](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver).
As long as the SDK container has its network exposed to the address and port of
the inference server, Perf Analyzer will be able to run.
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
# inside container
perf_analyzer -m <model>
```
# Alternative Installation Methods
- [Pip](#pip)
- [Build from Source](#build-from-source)
## Pip
```bash
pip install tritonclient
perf_analyzer -m <model>
```
**Warning**: If any runtime dependencies are missing, Perf Analyzer will produce
errors showing which ones are missing. You will need to manually install them.
## Build from Source
The Triton SDK container is used for building, so some build and runtime
dependencies are already installed.
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
# inside container
# prep installing newer version of cmake
apt update && apt install -y gpg wget && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && . /etc/os-release && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null
# install build/runtime dependencies
apt update && apt install -y cmake-data cmake libcurl4-openssl-dev rapidjson-dev
rm -rf client ; git clone --depth 1 https://github.com/triton-inference-server/client
mkdir client/build ; cd client/build
cmake -DTRITON_ENABLE_PERF_ANALYZER=ON ..
make -j8 cc-clients
perf_analyzer -m <model>
```
- To enable
[CUDA shared memory](input_data.md#shared-memory), add
`-DTRITON_ENABLE_GPU=ON` to the `cmake` command.
- To enable
[C API mode](benchmarking.md#benchmarking-triton-directly-via-c-api), add
`-DTRITON_ENABLE_PERF_ANALYZER_C_API=ON` to the `cmake` command.
- To enable [TorchServe backend](benchmarking.md#benchmarking-torchserve), add
`-DTRITON_ENABLE_PERF_ANALYZER_TS=ON` to the `cmake` command.
- To enable
[Tensorflow Serving backend](benchmarking.md#benchmarking-tensorflow-serving),
add `-DTRITON_ENABLE_PERF_ANALYZER_TFS=ON` to the `cmake` command.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Measurement Modes
Currently, Perf Analyzer has 2 measurement modes.
## Time Windows
When using time windows measurement mode
([`--measurement-mode=time_windows`](cli.md#--measurement-modetime_windowscount_windows)),
Perf Analyzer will count how many requests have completed during a window of
duration `X` (in milliseconds, via
[`--measurement-interval=X`](cli.md#--measurement-intervaln), default is
`5000`). This is the default measurement mode.
## Count Windows
When using count windows measurement mode
([`--measurement-mode=count_windows`](cli.md#--measurement-modetime_windowscount_windows)),
Perf Analyzer will start the window duration at 1 second and potentially
dynamically increase it until `X` requests have completed (via
[`--measurement-request-count=X`](cli.md#--measurement-request-countn), default
is `50`).
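For example, a hypothetical run (model name is illustrative) that keeps each
measurement window open until at least 100 requests have completed:
```bash
# Grow each measurement window until 100 requests complete.
perf_analyzer -m my_model --measurement-mode=count_windows --measurement-request-count=100
```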
# Metrics
## How Throughput is Calculated
Perf Analyzer calculates throughput to be the total number of requests completed
during a measurement, divided by the duration of the measurement, in seconds.
## How Latency is Calculated
For each request concurrency level Perf Analyzer reports latency and throughput
as seen from Perf Analyzer and also the average request latency on the server.
The server latency measures the total time from when the request is received at
the server until when the response is sent from the server. Because of the HTTP
and gRPC libraries used to implement the server endpoints, total server latency
is typically more accurate for HTTP requests, as it measures the time from the
first byte received until the last byte sent. For both HTTP and gRPC, the total
server latency is broken down into the following components:
- _queue_: The average time spent in the inference schedule queue by a request
waiting for an instance of the model to become available.
- _compute_: The average time spent performing the actual inference, including
any time needed to copy data to/from the GPU.
- _overhead_: The average time spent in the endpoint that cannot be correctly
captured in the send/receive time with the way the gRPC and HTTP libraries are
structured.
The client latency time is broken down further for HTTP and gRPC as follows:
- HTTP: _send/recv_ indicates the time on the client spent sending the request
and receiving the response. _response wait_ indicates time waiting for the
response from the server.
- gRPC: _(un)marshal request/response_ indicates the time spent marshalling the
request data into the gRPC protobuf and unmarshalling the response data from
the gRPC protobuf. _response wait_ indicates time writing the gRPC request to
the network, waiting for the response, and reading the gRPC response from the
network.
Use the verbose ([`-v`](cli.md#-v)) option to see more output, including the
stabilization passes run for each request concurrency level or request rate.
# Reports
## Visualizing Latency vs. Throughput
Perf Analyzer provides the [`-f`](cli.md#-f-path) option to generate a file
containing CSV output of the results.
```
$ perf_analyzer -m inception_graphdef --concurrency-range 1:4 -f perf.csv
...
$ cat perf.csv
Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,Client Recv,p50 latency,p90 latency,p95 latency,p99 latency
1,69.2,225,2148,64,206,11781,19,0,13891,18795,19753,21018
3,84.2,237,1768,21673,209,11742,17,0,35398,43984,47085,51701
4,84.2,279,1604,33669,233,11731,18,1,47045,56545,59225,64886
2,87.2,235,1973,9151,190,11346,17,0,21874,28557,29768,34766
```
NOTE: The rows in the CSV file are sorted in increasing order of throughput
(Inferences/Second).
You can import the CSV file into a spreadsheet to help visualize the latency vs
inferences/second tradeoff as well as see some components of the latency. Follow
these steps:
- Open
[this spreadsheet](https://docs.google.com/spreadsheets/d/1S8h0bWBBElHUoLd2SOvQPzZzRiQ55xjyqodm_9ireiw)
- Make a copy from the File menu "Make a copy..."
- Open the copy
- Select the A1 cell on the "Raw Data" tab
- From the File menu select "Import..."
- Select "Upload" and upload the file
- Select "Replace data at selected cell" and then select the "Import data"
button
## Server-side Prometheus metrics
Perf Analyzer can collect
[server-side metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md#gpu-metrics),
such as GPU utilization and GPU power usage. To enable the collection of these
metrics, use the [`--collect-metrics`](cli.md#--collect-metrics) option.
By default, Perf Analyzer queries the metrics endpoint at the URL
`localhost:8002/metrics`. If the metrics are accessible at a different URL, use
the [`--metrics-url=<url>`](cli.md#--metrics-urlurl) option to specify it.
By default, Perf Analyzer queries the metrics endpoint every 1000 milliseconds.
To use a different querying interval, use the
[`--metrics-interval=<n>`](cli.md#--metrics-intervaln) option (specify in
milliseconds).
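For example, a hypothetical invocation (the metrics URL and output file name
are illustrative) that collects metrics from a non-default endpoint every 500
milliseconds and writes them to a CSV file:
```bash
# --verbose-csv is required for the metrics to appear in the CSV output.
perf_analyzer -m my_model --collect-metrics --metrics-url=remote_host:8002/metrics \
    --metrics-interval=500 -f results.csv --verbose-csv
```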
Because Perf Analyzer can collect the server-side metrics multiple times per
run, these metrics are aggregated in specific ways to produce one final number
per searched concurrency or request rate. Here is how the metrics are
aggregated:
| Metric | Aggregation |
| - | - |
| GPU Utilization | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. |
| GPU Power Usage | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. |
| GPU Used Memory | Maximum from all collections taken during a stable pass. Users are typically curious what the peak memory usage is for determining model/hardware viability. |
| GPU Total Memory | First from any collection taken during a stable pass. All of the collections should produce the same value for total memory available on the GPU. |
Note that all metrics are per-GPU in the case of multi-GPU systems.
To output these server-side metrics to a CSV file, use the
[`-f <path>`](cli.md#-f-path) and [`--verbose-csv`](cli.md#--verbose-csv)
options. The output CSV will contain one column per metric. The value of each
column will be a `key:value` pair (`GPU UUID:metric value`). Each `key:value`
pair will be delimited by a semicolon (`;`) to indicate metric values for each
GPU accessible by the server. There is a trailing semicolon. See below:
`<gpu-uuid-0>:<metric-value>;<gpu-uuid-1>:<metric-value>;...;`
Here is a simplified CSV output:
```
$ perf_analyzer -m resnet50_libtorch --collect-metrics -f output.csv --verbose-csv
$ cat output.csv
Concurrency,...,Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory
1,...,gpu_uuid_0:0.33;gpu_uuid_1:0.5;,gpu_uuid_0:55.3;gpu_uuid_1:56.9;,gpu_uuid_0:10000;gpu_uuid_1:11000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
2,...,gpu_uuid_0:0.25;gpu_uuid_1:0.6;,gpu_uuid_0:25.6;gpu_uuid_1:77.2;,gpu_uuid_0:11000;gpu_uuid_1:17000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
3,...,gpu_uuid_0:0.87;gpu_uuid_1:0.9;,gpu_uuid_0:87.1;gpu_uuid_1:71.7;,gpu_uuid_0:15000;gpu_uuid_1:22000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
```
## Communication Protocol
By default, Perf Analyzer uses HTTP to communicate with Triton. The gRPC
protocol can be specified with the [`-i [http|grpc]`](cli.md#-i-httpgrpc)
option. If gRPC is selected the [`--streaming`](cli.md#--streaming) option can
also be specified for gRPC streaming.
### SSL/TLS Support
Perf Analyzer can be used to benchmark a Triton service behind SSL/TLS-enabled
endpoints. The following options help establish a secure connection with the
endpoint and profile the server.
For gRPC, see the following options:
- [`--ssl-grpc-use-ssl`](cli.md#--ssl-grpc-use-ssl)
- [`--ssl-grpc-root-certifications-file=<path>`](cli.md#--ssl-grpc-root-certifications-filepath)
- [`--ssl-grpc-private-key-file=<path>`](cli.md#--ssl-grpc-private-key-filepath)
- [`--ssl-grpc-certificate-chain-file=<path>`](cli.md#--ssl-grpc-certificate-chain-filepath)
More details here:
https://grpc.github.io/grpc/cpp/structgrpc_1_1_ssl_credentials_options.html
The
[inference protocol gRPC SSL/TLS section](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#ssltls)
describes server-side options to configure SSL/TLS in Triton's gRPC endpoint.
For HTTPS, the following options are exposed:
- [`--ssl-https-verify-peer`](cli.md#--ssl-https-verify-peer01)
- [`--ssl-https-verify-host`](cli.md#--ssl-https-verify-host012)
- [`--ssl-https-ca-certificates-file`](cli.md#--ssl-https-ca-certificates-filepath)
- [`--ssl-https-client-certificate-file`](cli.md#--ssl-https-client-certificate-filepath)
- [`--ssl-https-client-certificate-type`](cli.md#--ssl-https-client-certificate-typepemder)
- [`--ssl-https-private-key-file`](cli.md#--ssl-https-private-key-filepath)
- [`--ssl-https-private-key-type`](cli.md#--ssl-https-private-key-typepemder)
See [`--help`](cli.md#--help) for full documentation.
Unlike gRPC, Triton's HTTP server endpoint cannot be configured with SSL/TLS
support.
Note: Just providing these `--ssl-https-*` options to Perf Analyzer does not
ensure that SSL/TLS is used in communication. If SSL/TLS is not enabled on the
service endpoint, these options have no effect. The intent of exposing these
options is to allow users to configure Perf Analyzer to benchmark a Triton
service behind SSL/TLS-enabled endpoints. In other words, if Triton is running
behind an HTTPS server proxy, then these options allow Perf Analyzer to profile
Triton via the exposed HTTPS proxy.
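For example, a hypothetical HTTPS invocation against a TLS-terminating proxy
(the model name, proxy address, and all file paths are illustrative):
```bash
# Verify the proxy's certificate against a CA bundle and present a client cert.
perf_analyzer -m my_model -i http -u https_proxy_host:443 \
    --ssl-https-verify-peer=1 --ssl-https-verify-host=2 \
    --ssl-https-ca-certificates-file=/path/to/ca_bundle.pem \
    --ssl-https-client-certificate-type=PEM \
    --ssl-https-client-certificate-file=/path/to/client_cert.pem \
    --ssl-https-private-key-type=PEM \
    --ssl-https-private-key-file=/path/to/client_key.pem
```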
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Quick Start
The steps below will guide you on how to start using Perf Analyzer.
### Step 1: Start Triton Container
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3
```
### Step 2: Download `simple` Model
```bash
# inside triton container
git clone --depth 1 https://github.com/triton-inference-server/server
mkdir model_repository ; cp -r server/docs/examples/model_repository/simple model_repository
```
### Step 3: Start Triton Server
```bash
# inside triton container
tritonserver --model-repository $(pwd)/model_repository &> server.log &
# confirm server is ready, look for 'HTTP/1.1 200 OK'
curl -v localhost:8000/v2/health/ready
# detach (CTRL-p CTRL-q)
```
### Step 4: Start Triton SDK Container
```bash
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
```
### Step 5: Run Perf Analyzer
```bash
# inside sdk container
perf_analyzer -m simple
```
### Step 6: Observe and Analyze Output
```
$ perf_analyzer -m simple
*** Measurement Settings ***
Batch size: 1
Service Kind: Triton
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 25348
Throughput: 1407.84 infer/sec
Avg latency: 708 usec (standard deviation 663 usec)
p50 latency: 690 usec
p90 latency: 881 usec
p95 latency: 926 usec
p99 latency: 1031 usec
Avg HTTP time: 700 usec (send/recv 102 usec + response wait 598 usec)
Server:
Inference count: 25348
Execution count: 25348
Successful request count: 25348
Avg request latency: 382 usec (overhead 41 usec + queue 41 usec + compute input 26 usec + compute infer 257 usec + compute output 16 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 1407.84 infer/sec, latency 708 usec
```
We can see from the output that the model was able to complete approximately
1407.84 inferences per second, with an average latency of 708 microseconds per
inference request. A concurrency of 1 means that Perf Analyzer attempted to
keep 1 inference request outstanding at all times.
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "base_queue_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Context ID Tracker that reuses IDs in a roughly round-robin manner using a
// FIFO
//
class FifoCtxIdTracker : public BaseQueueCtxIdTracker {
public:
FifoCtxIdTracker() = default;
void Reset(size_t count) override
{
Clear();
for (size_t i = 0; i < count; ++i) {
free_ctx_ids_.push(i);
}
}
};
}}  // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace perfanalyzer {
/// Interface for object that tracks context IDs
///
class ICtxIdTracker {
public:
// Reset the tracker using the provided input count
//
virtual void Reset(size_t count) = 0;
// Restore the given ID into the tracker
//
virtual void Restore(size_t id) = 0;
// Pick and return a Ctx ID
//
virtual size_t Get() = 0;
// Returns true if there are Ctx IDs available to Get.
virtual bool IsAvailable() = 0;
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <mutex>
#include <stdexcept>
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestLoadManager;
#endif
/// Class to track idle periods of time
///
class IdleTimer {
public:
void Start()
{
std::lock_guard<std::mutex> lk(mtx_);
StartImpl();
}
void Stop()
{
std::lock_guard<std::mutex> lk(mtx_);
StopImpl();
}
/// Reset the time counter, and restart the timer if it is active
///
void Reset()
{
Restart();
idle_ns_ = 0;
}
/// Returns the number of nanoseconds this timer has counted as being idle
/// If the timer was already active, then it will first stop (and count the
/// pending time), and then start back up
///
uint64_t GetIdleTime()
{
Restart();
return idle_ns_;
}
private:
std::mutex mtx_;
uint64_t idle_ns_{0};
bool is_idle_{false};
std::chrono::steady_clock::time_point start_time_;
void Restart()
{
std::lock_guard<std::mutex> lk(mtx_);
if (is_idle_) {
StopImpl();
StartImpl();
}
}
void StartImpl()
{
if (is_idle_) {
throw std::runtime_error("Can't start a timer that is already active\n");
}
is_idle_ = true;
start_time_ = std::chrono::steady_clock::now();
}
void StopImpl()
{
if (!is_idle_) {
throw std::runtime_error("Can't stop a timer that isn't active\n");
}
is_idle_ = false;
auto end = std::chrono::steady_clock::now();
auto duration = end - start_time_;
idle_ns_ += duration.count();
}
#ifndef DOCTEST_CONFIG_DISABLE
friend TestLoadManager;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
/// Interface for classes that manage infer data preparation for inference
///
class IInferDataManager {
public:
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
virtual cb::Error Init() = 0;
/// Populate the target InferData object with input and output objects
/// according to the model's shape
/// \param infer_data The target InferData object.
/// \return cb::Error object indicating success or failure.
virtual cb::Error InitInferData(InferData& infer_data) = 0;
/// Updates the input and expected output data in the target infer_data for an
/// inference request
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInferData(
size_t thread_id, int stream_index, int step_index,
InferData& infer_data) = 0;
};
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_context.h"
namespace triton { namespace perfanalyzer {
void
InferContext::Init()
{
thread_stat_->status_ = infer_data_manager_->InitInferData(infer_data_);
if (!thread_stat_->status_.IsOk()) {
return;
}
if (streaming_) {
// Decoupled models should not collect client side statistics
thread_stat_->status_ = infer_backend_->StartStream(
async_callback_func_, (!parser_->IsDecoupled()));
if (!thread_stat_->status_.IsOk()) {
return;
}
}
}
void
InferContext::SendInferRequest(bool delayed)
{
// Update the inputs if required
if (using_json_data_) {
UpdateJsonData();
}
SendRequest(request_id_++, delayed);
}
void
InferContext::SendSequenceInferRequest(uint32_t seq_stat_index, bool delayed)
{
// Need lock to protect the order of dispatch across worker threads.
// This also helps in reporting the realistic latencies.
std::lock_guard<std::mutex> guard(
sequence_manager_->GetMutex(seq_stat_index));
if (!early_exit && execute_) {
sequence_manager_->SetInferSequenceOptions(
seq_stat_index, infer_data_.options_);
// Update the inputs if required
if (using_json_data_) {
UpdateSeqJsonData(seq_stat_index);
}
sequence_manager_->DecrementRemainingQueries(seq_stat_index);
SendRequest(
request_id_++, delayed,
sequence_manager_->GetSequenceID(seq_stat_index));
}
}
void
InferContext::CompleteOngoingSequence(uint32_t seq_stat_index)
{
std::lock_guard<std::mutex> guard(
sequence_manager_->GetMutex(seq_stat_index));
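  // If the sequence still has remaining queries, collapse them to a single
  // final request so that the ongoing sequence can be closed out.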
if (sequence_manager_->GetRemainingQueries(seq_stat_index) != 0) {
sequence_manager_->SetRemainingQueries(seq_stat_index, 1);
sequence_manager_->SetInferSequenceOptions(
seq_stat_index, infer_data_.options_);
if (using_json_data_) {
UpdateSeqJsonData(seq_stat_index);
}
sequence_manager_->DecrementRemainingQueries(seq_stat_index);
bool is_delayed = false;
SendRequest(
request_id_++, is_delayed,
sequence_manager_->GetSequenceID(seq_stat_index));
}
}
void
InferContext::SendRequest(
const uint64_t request_id, const bool delayed, const uint64_t sequence_id)
{
if (!thread_stat_->status_.IsOk()) {
return;
}
thread_stat_->num_sent_requests_++;
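  // Async path: record the request in async_req_map_ so the completion
  // callback can compute latency when the response(s) arrive. Sync path:
  // time the blocking Infer() call inline and record the result here.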
if (async_) {
infer_data_.options_->request_id_ = std::to_string(request_id);
{
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
auto it = async_req_map_
.emplace(infer_data_.options_->request_id_, RequestRecord())
.first;
it->second.start_time_ = std::chrono::system_clock::now();
it->second.sequence_end_ = infer_data_.options_->sequence_end_;
it->second.delayed_ = delayed;
it->second.sequence_id_ = sequence_id;
}
thread_stat_->idle_timer.Start();
if (streaming_) {
thread_stat_->status_ = infer_backend_->AsyncStreamInfer(
*(infer_data_.options_), infer_data_.valid_inputs_,
infer_data_.outputs_);
} else {
thread_stat_->status_ = infer_backend_->AsyncInfer(
async_callback_func_, *(infer_data_.options_),
infer_data_.valid_inputs_, infer_data_.outputs_);
}
thread_stat_->idle_timer.Stop();
total_ongoing_requests_++;
} else {
std::chrono::time_point<std::chrono::system_clock> start_time_sync,
end_time_sync;
thread_stat_->idle_timer.Start();
start_time_sync = std::chrono::system_clock::now();
cb::InferResult* results = nullptr;
thread_stat_->status_ = infer_backend_->Infer(
&results, *(infer_data_.options_), infer_data_.valid_inputs_,
infer_data_.outputs_);
thread_stat_->idle_timer.Stop();
if (results != nullptr) {
if (thread_stat_->status_.IsOk()) {
thread_stat_->status_ = ValidateOutputs(results);
}
delete results;
}
if (!thread_stat_->status_.IsOk()) {
return;
}
end_time_sync = std::chrono::system_clock::now();
std::vector<std::chrono::time_point<std::chrono::system_clock>>
end_time_syncs{end_time_sync};
{
      // Add the request record to the thread's request records vector with
      // proper locking
      std::lock_guard<std::mutex> lock(thread_stat_->mu_);
thread_stat_->request_records_.emplace_back(RequestRecord(
start_time_sync, std::move(end_time_syncs),
infer_data_.options_->sequence_end_, delayed, sequence_id, false));
thread_stat_->status_ =
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
if (!thread_stat_->status_.IsOk()) {
return;
}
}
}
}
void
InferContext::UpdateJsonData()
{
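  // Threads stride through the provided data: each call advances this
  // thread's step by the number of active threads, so concurrent threads
  // pick up different portions of the data.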
int step_id = (data_step_id_ * batch_size_) % data_loader_->GetTotalSteps(0);
data_step_id_ += GetNumActiveThreads();
thread_stat_->status_ =
infer_data_manager_->UpdateInferData(thread_id_, 0, step_id, infer_data_);
}
void
InferContext::UpdateSeqJsonData(size_t seq_stat_index)
{
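  // The data step for a sequence request is its zero-based position within
  // the sequence, (sequence_length - remaining_queries), wrapped around the
  // total number of steps in the assigned data stream.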
const size_t sequence_length{
sequence_manager_->GetSequenceLength(seq_stat_index)};
const size_t remaining_queries{
sequence_manager_->GetRemainingQueries(seq_stat_index)};
const uint64_t data_stream_id{
sequence_manager_->GetDataStreamID(seq_stat_index)};
const size_t total_steps{data_loader_->GetTotalSteps(data_stream_id)};
int step_id = (sequence_length - remaining_queries) % total_steps;
thread_stat_->status_ = infer_data_manager_->UpdateInferData(
thread_id_, data_stream_id, step_id, infer_data_);
}
cb::Error
InferContext::ValidateOutputs(const cb::InferResult* result_ptr)
{
// Validate output if set
if (!infer_data_.expected_outputs_.empty()) {
for (size_t i = 0; i < infer_data_.outputs_.size(); ++i) {
const uint8_t* buf = nullptr;
size_t byte_size = 0;
result_ptr->RawData(infer_data_.outputs_[i]->Name(), &buf, &byte_size);
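      // Walk the raw output buffer one batch-1 chunk per expected entry;
      // leftover or missing bytes are reported as a size mismatch.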
for (const auto& expected : infer_data_.expected_outputs_[i]) {
if (!expected.is_valid) {
return cb::Error(
"Expected output can't be invalid", pa::GENERIC_ERROR);
}
if (byte_size < expected.batch1_size) {
return cb::Error(
"Output size doesn't match expected size", pa::GENERIC_ERROR);
} else if (memcmp(buf, expected.data_ptr, expected.batch1_size) != 0) {
return cb::Error(
"Output doesn't match expected output", pa::GENERIC_ERROR);
} else {
buf += expected.batch1_size;
byte_size -= expected.batch1_size;
}
}
if (byte_size != 0) {
return cb::Error(
"Output size doesn't match expected size", pa::GENERIC_ERROR);
}
}
}
return cb::Error::Success;
}
void
InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
{
std::shared_ptr<cb::InferResult> result_ptr(result);
bool is_final_response{true};
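  // A request may produce multiple responses (e.g. decoupled models); each
  // response timestamp is appended to the pending RequestRecord, and the
  // record is only finalized and removed from async_req_map_ once the final
  // response is observed.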
if (thread_stat_->cb_status_.IsOk()) {
    // Add the request record to the thread's request records vector with
    // proper locking
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
thread_stat_->cb_status_ = result_ptr->RequestStatus();
if (thread_stat_->cb_status_.IsOk()) {
std::string request_id;
thread_stat_->cb_status_ = result_ptr->Id(&request_id);
const auto& it = async_req_map_.find(request_id);
if (it != async_req_map_.end()) {
bool is_null_response{false};
thread_stat_->cb_status_ =
result_ptr->IsNullResponse(&is_null_response);
        if (!thread_stat_->cb_status_.IsOk()) {
return;
}
it->second.response_times_.push_back(std::chrono::system_clock::now());
        if (is_null_response) {
it->second.has_null_last_response_ = true;
}
thread_stat_->cb_status_ =
result_ptr->IsFinalResponse(&is_final_response);
        if (!thread_stat_->cb_status_.IsOk()) {
return;
}
if (is_final_response) {
thread_stat_->request_records_.emplace_back(
it->second.start_time_, it->second.response_times_,
it->second.sequence_end_, it->second.delayed_,
it->second.sequence_id_, it->second.has_null_last_response_);
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
thread_stat_->cb_status_ = ValidateOutputs(result);
async_req_map_.erase(request_id);
}
}
}
}
if (is_final_response) {
total_ongoing_requests_--;
if (async_callback_finalize_func_ != nullptr) {
async_callback_finalize_func_(id_);
}
}
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>
#include "data_loader.h"
#include "idle_timer.h"
#include "iinfer_data_manager.h"
#include "infer_data.h"
#include "perf_utils.h"
#include "request_record.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
// Holds the running status of the thread.
struct ThreadStat {
ThreadStat() {}
// The status of the worker thread
cb::Error status_;
// The status of the callback thread for async requests
cb::Error cb_status_;
// TODO REFACTOR TMA-1046 -- This should be in the InferContext class
// The statistics of the InferContext
std::vector<cb::InferStat> contexts_stat_;
// Tracks the amount of time this thread spent sleeping or waiting
IdleTimer idle_timer;
// A vector of request records
std::vector<RequestRecord> request_records_;
// A lock to protect thread data
std::mutex mu_;
// The number of sent requests by this thread.
std::atomic<size_t> num_sent_requests_{0};
};
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferContext;
#endif
/// Sends inference requests to the server
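/// Typical usage (illustrative): construct the context, call Init() once,
/// then issue requests with SendInferRequest() or SendSequenceInferRequest();
/// GetNumOngoingRequests() reports async requests that have not yet returned.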
class InferContext {
public:
InferContext(
const size_t thread_id, const uint32_t id, const bool async,
const bool streaming, const bool on_sequence_model,
const bool using_json_data, const int32_t batch_size,
std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<DataLoader> data_loader,
std::shared_ptr<ModelParser> parser,
std::shared_ptr<cb::ClientBackendFactory> factory, const bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: thread_id_(thread_id), id_(id), async_(async), streaming_(streaming),
on_sequence_model_(on_sequence_model),
using_json_data_(using_json_data), batch_size_(batch_size),
thread_stat_(thread_stat), data_loader_(data_loader), parser_(parser),
factory_(factory), data_step_id_(id), execute_(execute),
infer_data_manager_(infer_data_manager),
sequence_manager_(sequence_manager)
{
thread_stat_->status_ = factory_->CreateClientBackend(&infer_backend_);
infer_data_.options_.reset(new cb::InferOptions(parser_->ModelName()));
infer_data_.options_->model_version_ = parser_->ModelVersion();
infer_data_.options_->model_signature_name_ = parser_->ModelSignatureName();
thread_stat_->contexts_stat_.emplace_back();
}
InferContext(InferContext&&) = delete;
InferContext(const InferContext&) = delete;
// Initialize the context. Must be done before any inferences are sent
void Init();
// Send a single inference request to the server
void SendInferRequest(bool delayed = false);
// Send a single sequence inference request to the server
  void SendSequenceInferRequest(uint32_t seq_stat_index, bool delayed = false);
// Finish the active sequence at the given seq_stat_index
void CompleteOngoingSequence(uint32_t seq_stat_index);
// Returns the total number of async requests that have been sent by this
// object and have not returned
uint GetNumOngoingRequests() { return total_ongoing_requests_; }
// Register a function that will get called after every async request returns
void RegisterAsyncCallbackFinalize(std::function<void(uint32_t)> callback)
{
async_callback_finalize_func_ = callback;
}
// TODO REFACTOR TMA-1043 this should be in memory class
void SetNumActiveThreads(size_t num_threads)
{
num_active_threads_ = num_threads;
}
protected:
/// A helper function to issue inference request to the server.
/// \param request_id The unique id to be associated with the request.
/// \param delayed Whether the request fell behind its scheduled time.
/// \param sequence_id Sequence ID of the request. Note that the default of
/// `0` means the request is not a sequence.
virtual void SendRequest(
const uint64_t request_id, const bool delayed,
const uint64_t sequence_id = 0);
/// Update inputs based on custom json data
void UpdateJsonData();
/// Update inputs based on custom json data for the given sequence
void UpdateSeqJsonData(size_t seq_stat_index);
cb::Error ValidateOutputs(const cb::InferResult* result_ptr);
// Callback function for handling asynchronous requests
void AsyncCallbackFuncImpl(cb::InferResult* result);
bool async_{false};
bool streaming_{false};
const bool on_sequence_model_{false};
bool using_json_data_{false};
const int32_t batch_size_{0};
std::shared_ptr<ThreadStat> thread_stat_;
std::shared_ptr<DataLoader> data_loader_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
std::shared_ptr<IInferDataManager> infer_data_manager_;
uint64_t request_id_ = 0;
std::map<std::string, RequestRecord> async_req_map_;
std::atomic<uint> total_ongoing_requests_{0};
size_t data_step_id_;
// Function pointer to the async callback function implementation
std::function<void(cb::InferResult*)> async_callback_func_ = std::bind(
&InferContext::AsyncCallbackFuncImpl, this, std::placeholders::_1);
// Function pointer to registered async callbacks
std::function<void(uint32_t)> async_callback_finalize_func_ = nullptr;
private:
const uint32_t id_{0};
const size_t thread_id_{0};
size_t GetNumActiveThreads() { return num_active_threads_; }
size_t num_active_threads_{0};
// The backend to communicate with the server
std::unique_ptr<cb::ClientBackend> infer_backend_;
InferData infer_data_;
// FIXME: update build to use C++17 instead of C++14. This is a workaround
// since C++14 doesn't have std::optional, but C++17 does.
const bool execute_placeholder_{false};
std::reference_wrapper<const bool> execute_{execute_placeholder_};
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferContext;
public:
InferContext() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
/// Holds all the data needed to send an inference request
struct InferData {
~InferData()
{
for (const auto input : inputs_) {
delete input;
}
for (const auto output : outputs_) {
delete output;
}
}
// The vector of pointers to InferInput objects for all possible inputs,
// potentially including optional inputs with no provided data.
std::vector<cb::InferInput*> inputs_;
  // The vector of pointers to InferInput objects actually used for the
  // inference request. These pointers are non-owning: only 'inputs_' and
  // 'outputs_' are deleted in the destructor.
  std::vector<cb::InferInput*> valid_inputs_;
// The vector of pointers to InferRequestedOutput objects
// to be used with the inference request.
std::vector<const cb::InferRequestedOutput*> outputs_;
  // If not empty, the expected output data, in the same order as 'outputs_'.
  // The outer vector is per-output; the inner vector holds the batched
  // entries for each output.
std::vector<std::vector<TensorData>> expected_outputs_;
// The InferOptions object holding the details of the
// inference.
std::unique_ptr<cb::InferOptions> options_;
};
}} // namespace triton::perfanalyzer