// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "scoped_defer.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
ScopedDefer::ScopedDefer(std::function<void()> task)
{
task_ = task;
done_ = false;
}
void
ScopedDefer::Complete()
{
if (!done_) {
task_();
done_ = true;
}
}
ScopedDefer::~ScopedDefer()
{
if (!done_) {
task_();
}
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class ScopedDefer {
public:
ScopedDefer(std::function<void()> task);
~ScopedDefer();
void Complete();
private:
std::function<void()> task_;
bool done_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
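// Illustrative usage sketch (not part of the original sources). ScopedDefer
// runs the supplied task in its destructor unless Complete() has already run
// it, which makes it a simple way to guarantee cleanup on every exit path.
// The function below is hypothetical and uses only the class declared above
// plus <cstdio>.
#include <cstdio>
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline void
ScopedDeferUsageExample(std::FILE* file)
{
  // Close 'file' when this scope exits, even on an early return.
  ScopedDefer closer([file]() { std::fclose(file); });
  // ... work with 'file' ...
  closer.Complete();  // run the cleanup eagerly; the destructor then no-ops
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi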
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "shared_library.h"
#include <dlfcn.h>
#include <iostream>
/// FIXME: Duplication of server/src/core/shared_library.cc
/// Separate shared_library to common library and delete this
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
Error
OpenLibraryHandle(const std::string& path, void** handle)
{
*handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
if (*handle == nullptr) {
return Error("unable to load backend library: " + std::string(dlerror()));
}
return Error::Success;
}
Error
CloseLibraryHandle(void* handle)
{
if (handle != nullptr) {
if (dlclose(handle) != 0) {
return Error(
"unable to unload backend library: " + std::string(dlerror()));
}
}
return Error::Success;
}
Error
GetEntrypoint(
void* handle, const std::string& name, const bool optional, void** befn)
{
*befn = nullptr;
dlerror();
void* fn = dlsym(handle, name.c_str());
const char* dlsym_error = dlerror();
if (dlsym_error != nullptr) {
if (optional) {
return Error::Success;
}
std::string errstr(dlsym_error); // need copy as dlclose overwrites
return Error(
"unable to find required entrypoint '" + name +
"' in backend library: " + errstr);
}
if (fn == nullptr) {
if (optional) {
return Error::Success;
}
return Error(
"unable to find required entrypoint '" + name + "' in backend library");
}
*befn = fn;
return Error::Success;
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "../client_backend.h"
/// FIXME: Duplication of server/src/core/shared_library.h
/// Separate shared_library to common library and delete this
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
Error OpenLibraryHandle(const std::string& path, void** handle);
Error CloseLibraryHandle(void* handle);
Error GetEntrypoint(
void* handle, const std::string& name, const bool optional, void** befn);
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
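// Illustrative usage sketch (not part of the original sources): open a shared
// library, resolve one entrypoint, and close the handle again. The library
// path is a hypothetical placeholder; only the three helpers declared above
// and Error::IsOk() are used.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
SharedLibraryUsageExample()
{
  void* handle = nullptr;
  Error err = OpenLibraryHandle(
      "/opt/tritonserver/lib/libtritonserver.so" /* hypothetical path */,
      &handle);
  if (!err.IsOk()) {
    return err;
  }
  // Resolve a required (non-optional) symbol into a generic function pointer.
  void* fn = nullptr;
  err = GetEntrypoint(handle, "TRITONSERVER_ApiVersion", false /* optional */, &fn);
  if (!err.IsOk()) {
    CloseLibraryHandle(handle);
    return err;
  }
  // ... cast 'fn' to the matching function-pointer type before calling it ...
  return CloseLibraryHandle(handle);
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi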
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "shared_memory_manager.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "common.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
SharedMemoryManager::~SharedMemoryManager()
{
UnregisterAll(TRITONSERVER_MEMORY_CPU);
UnregisterAll(TRITONSERVER_MEMORY_GPU);
}
#ifdef TRITON_ENABLE_GPU
Error
SharedMemoryManager::RegisterCUDAMemory(
const std::string& name, void* dev_ptr, const size_t byte_size,
const int device_id)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
// If name is already in shared_memory_map_ then return error saying already
// registered
if (shared_memory_map_.find(name) != shared_memory_map_.end()) {
return Error(
std::string("shared memory region '" + name + "' already in manager"));
}
shared_memory_map_.insert(std::make_pair(
name, std::unique_ptr<MemoryInfo>(new MemoryInfo(
name, 0 /* offset */, byte_size, dev_ptr,
TRITONSERVER_MEMORY_GPU, device_id))));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
Error
SharedMemoryManager::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
// If name is already in shared_memory_map_ then return error saying already
// registered
if (shared_memory_map_.find(name) != shared_memory_map_.end()) {
return Error("shared memory region '" + name + "' already in manager");
}
shared_memory_map_.insert(std::make_pair(
name, std::make_unique<MemoryInfo>(
name, 0 /* offset */, byte_size, ptr, TRITONSERVER_MEMORY_CPU,
0 /* device id */)));
return Error::Success;
}
Error
SharedMemoryManager::GetMemoryInfo(
const std::string& name, size_t offset, void** shm_mapped_addr,
TRITONSERVER_MemoryType* memory_type, int64_t* device_id)
{
// protect shared_memory_map_ from concurrent access
std::lock_guard<std::mutex> lock(mu_);
auto it = shared_memory_map_.find(name);
if (it == shared_memory_map_.end()) {
return Error(
std::string("Unable to find shared memory region: '" + name + "'"));
}
if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) {
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ +
it->second->offset_ + offset);
} else {
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + offset);
}
*memory_type = it->second->kind_;
*device_id = it->second->device_id_;
return Error::Success;
}
Error
SharedMemoryManager::Unregister(
const std::string& name, TRITONSERVER_MemoryType memory_type)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
return UnregisterHelper(name, memory_type);
}
Error
SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
std::string error_message = "Failed to unregister the following ";
std::vector<std::string> unregister_fails;
if (memory_type == TRITONSERVER_MEMORY_CPU) {
error_message += "system shared memory regions: ";
for (auto& it : shared_memory_map_) {
if (it.second->kind_ == TRITONSERVER_MEMORY_CPU) {
Error err = UnregisterHelper(it.first, memory_type);
if (!err.IsOk()) {
unregister_fails.push_back(it.first);
}
}
}
} else if (memory_type == TRITONSERVER_MEMORY_GPU) {
error_message += "cuda shared memory regions: ";
for (auto& it : shared_memory_map_) {
if (it.second->kind_ == TRITONSERVER_MEMORY_GPU) {
Error err = UnregisterHelper(it.first, memory_type);
if (!err.IsOk()) {
unregister_fails.push_back(it.first);
}
}
}
}
if (!unregister_fails.empty()) {
for (auto unreg_fail : unregister_fails) {
error_message += unreg_fail + ", ";
}
return Error(error_message);
}
return Error::Success;
}
Error
SharedMemoryManager::UnregisterHelper(
const std::string& name, TRITONSERVER_MemoryType memory_type)
{
// Must hold the lock on mu_ while calling this function.
auto it = shared_memory_map_.find(name);
if (it == shared_memory_map_.end()) {
return Error("Shared memory region " + name + " doesn't exist.");
}
// Remove region information from shared_memory_map_
shared_memory_map_.erase(it);
return Error::Success;
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <triton/core/tritonserver.h>
#include <cstring>
#include <map>
#include <memory>
#include <mutex>
#include "../client_backend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class SharedMemoryManager {
public:
SharedMemoryManager() = default;
~SharedMemoryManager();
#ifdef TRITON_ENABLE_GPU
/// Add a block of CUDA (GPU) memory to the manager. Return an Error
/// if a memory block of the same name already exists in the manager.
/// \param name The name of the memory block.
/// \param dev_ptr The CUDA device pointer to the memory block.
/// \param byte_size The size, in bytes, of the block.
/// \param device_id The GPU number the memory region is in.
/// \return an Error indicating success or failure.
Error RegisterCUDAMemory(
const std::string& name, void* dev_ptr, const size_t byte_size,
const int device_id);
#endif // TRITON_ENABLE_GPU
/// Add a system memory block to the manager.
/// Return an Error if a shared memory block of the same name
/// already exists in the manager.
/// \param name The name of the memory block.
/// \param ptr The pointer to the system memory block.
/// \param byte_size The size, in bytes, of the block.
/// \return an Error indicating success or failure.
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size);
/// Get the access information for the shared memory block with the specified
/// name. Return an Error if the named block doesn't exist.
/// \param name The name of the shared memory block to get.
/// \param offset The offset within the block.
/// \param shm_mapped_addr Returns the pointer to the shared
/// memory block with the specified name and offset.
/// \param memory_type Returns the type of the memory.
/// \param device_id Returns the device id associated with the
/// memory block.
/// \return an Error indicating success or failure.
Error GetMemoryInfo(
const std::string& name, size_t offset, void** shm_mapped_addr,
TRITONSERVER_MemoryType* memory_type, int64_t* device_id);
/// Removes the named shared memory block of the specified type from
/// the manager. Any future attempt to get the details of this block
/// will result in an error until another block with the same name is
/// added to the manager.
/// \param name The name of the shared memory block to remove.
/// \param memory_type The type of memory to unregister.
/// \return an Error indicating success or failure.
Error Unregister(
const std::string& name, TRITONSERVER_MemoryType memory_type);
/// Unregister all shared memory blocks of specified type from the manager.
/// \param memory_type The type of memory to unregister.
/// \return an Error indicating success or failure.
Error UnregisterAll(TRITONSERVER_MemoryType memory_type);
private:
/// A helper function to remove the named shared memory block of the
/// specified type.
Error UnregisterHelper(
const std::string& name, TRITONSERVER_MemoryType memory_type);
/// A struct that records the shared memory regions registered by the shared
/// memory manager.
struct MemoryInfo {
MemoryInfo(
const std::string& name, const size_t offset, const size_t byte_size,
void* mapped_addr, const TRITONSERVER_MemoryType kind,
const int64_t device_id)
: name_(name), offset_(offset), byte_size_(byte_size),
mapped_addr_(mapped_addr), kind_(kind), device_id_(device_id)
{
}
std::string name_;
size_t offset_;
size_t byte_size_;
void* mapped_addr_;
TRITONSERVER_MemoryType kind_;
int64_t device_id_;
};
using SharedMemoryStateMap =
std::map<std::string, std::unique_ptr<MemoryInfo>>;
// A map between the name and the details of the associated
// shared memory block
SharedMemoryStateMap shared_memory_map_;
// A mutex to protect the concurrent access to shared_memory_map_
std::mutex mu_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
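// Illustrative usage sketch (not part of the original sources): register a
// block of system memory, look it up again, and unregister all CPU regions.
// The buffer and region name are hypothetical placeholders; the calls follow
// the declarations above.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
SharedMemoryManagerUsageExample()
{
  static char buffer[64];  // hypothetical data block that outlives the manager
  SharedMemoryManager manager;
  Error err =
      manager.RegisterSystemMemory("example_region", buffer, sizeof(buffer));
  if (!err.IsOk()) {
    return err;
  }
  // Retrieve the mapped address, memory type, and device id at a given offset.
  void* addr = nullptr;
  TRITONSERVER_MemoryType memory_type;
  int64_t device_id;
  err = manager.GetMemoryInfo(
      "example_region", 16 /* offset */, &addr, &memory_type, &device_id);
  if (!err.IsOk()) {
    return err;
  }
  // The destructor would also unregister everything that is still registered.
  return manager.UnregisterAll(TRITONSERVER_MEMORY_CPU);
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi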
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton_c_api_backend.h"
#include "c_api_infer_results.h"
#include "json_utils.h"
#include "triton_loader.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
//==============================================================================
Error
TritonCApiClientBackend::Create(
const std::string& triton_server_path,
const std::string& model_repository_path, const bool verbose,
std::unique_ptr<ClientBackend>* client_backend)
{
if (triton_server_path.empty()) {
return Error(
"--triton-server-path should not be empty when using "
"service-kind=triton_c_api.");
}
if (model_repository_path.empty()) {
return Error(
"--model-repository should not be empty when using "
"service-kind=triton_c_api.");
}
std::unique_ptr<TritonCApiClientBackend> triton_client_backend(
new TritonCApiClientBackend());
TritonLoader::Create(triton_server_path, model_repository_path, verbose);
*client_backend = std::move(triton_client_backend);
return Error::Success;
}
Error
TritonCApiClientBackend::ServerExtensions(std::set<std::string>* extensions)
{
rapidjson::Document server_metadata_json;
RETURN_IF_ERROR(triton_loader_->ServerMetaData(&server_metadata_json));
for (const auto& extension : server_metadata_json["extensions"].GetArray()) {
extensions->insert(
std::string(extension.GetString(), extension.GetStringLength()));
}
return Error::Success;
}
Error
TritonCApiClientBackend::ModelMetadata(
rapidjson::Document* model_metadata, const std::string& model_name,
const std::string& model_version)
{
if (!triton_loader_->ModelIsLoaded()) {
triton_loader_->LoadModel(model_name, model_version);
}
RETURN_IF_ERROR(triton_loader_->ModelMetadata(model_metadata));
return Error::Success;
}
Error
TritonCApiClientBackend::ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version)
{
if (!triton_loader_->ModelIsLoaded()) {
triton_loader_->LoadModel(model_name, model_version);
}
RETURN_IF_ERROR(
triton_loader_->ModelConfig(model_config, model_name, model_version));
return Error::Success;
}
Error
TritonCApiClientBackend::Infer(
cb::InferResult** result, const InferOptions& options,
const std::vector<InferInput*>& inputs,
const std::vector<const InferRequestedOutput*>& outputs)
{
std::vector<tc::InferInput*> triton_inputs;
ParseInferInputToTriton(inputs, &triton_inputs);
std::vector<const tc::InferRequestedOutput*> triton_outputs;
ParseInferRequestedOutputToTriton(outputs, &triton_outputs);
tc::InferOptions triton_options(options.model_name_);
ParseInferOptionsToTriton(options, &triton_options);
capi::InferResult* triton_result;
RETURN_IF_ERROR(triton_loader_->Infer(
triton_options, triton_inputs, triton_outputs, &triton_result));
*result = new TritonCApiInferResult(triton_result);
return Error::Success;
}
Error
TritonCApiClientBackend::ClientInferStat(InferStat* infer_stat)
{
tc::InferStat triton_infer_stat;
triton_loader_->ClientInferStat(&triton_infer_stat);
ParseInferStat(triton_infer_stat, infer_stat);
return Error::Success;
}
Error
TritonCApiClientBackend::ModelInferenceStatistics(
std::map<ModelIdentifier, ModelStatistics>* model_stats,
const std::string& model_name, const std::string& model_version)
{
rapidjson::Document infer_stat_json;
RETURN_IF_ERROR(triton_loader_->ModelInferenceStatistics(
model_name, model_version, &infer_stat_json));
ParseStatistics(infer_stat_json, model_stats);
return Error::Success;
}
Error
TritonCApiClientBackend::UnregisterAllSharedMemory()
{
RETURN_IF_ERROR(triton_loader_->UnregisterAllSharedMemory());
return Error::Success;
}
Error
TritonCApiClientBackend::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
RETURN_IF_ERROR(triton_loader_->RegisterSystemMemory(name, ptr, byte_size));
return Error::Success;
}
#ifdef TRITON_ENABLE_GPU
Error
TritonCApiClientBackend::RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size)
{
RETURN_IF_ERROR(triton_loader_->RegisterCudaMemory(name, handle, byte_size));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
void
TritonCApiClientBackend::ParseInferInputToTriton(
const std::vector<InferInput*>& inputs,
std::vector<tc::InferInput*>* triton_inputs)
{
for (const auto input : inputs) {
triton_inputs->push_back(
(dynamic_cast<TritonCApiInferInput*>(input))->Get());
}
}
void
TritonCApiClientBackend::ParseInferRequestedOutputToTriton(
const std::vector<const InferRequestedOutput*>& outputs,
std::vector<const tc::InferRequestedOutput*>* triton_outputs)
{
for (const auto output : outputs) {
triton_outputs->push_back(
(dynamic_cast<const TritonCApiInferRequestedOutput*>(output))->Get());
}
}
void
TritonCApiClientBackend::ParseInferOptionsToTriton(
const InferOptions& options, tc::InferOptions* triton_options)
{
triton_options->model_version_ = options.model_version_;
triton_options->request_id_ = options.request_id_;
if ((options.sequence_id_ != 0) || (options.sequence_id_str_ != "")) {
if (options.sequence_id_ != 0) {
triton_options->sequence_id_ = options.sequence_id_;
} else {
triton_options->sequence_id_str_ = options.sequence_id_str_;
}
triton_options->sequence_start_ = options.sequence_start_;
triton_options->sequence_end_ = options.sequence_end_;
}
}
void
TritonCApiClientBackend::ParseStatistics(
const rapidjson::Document& infer_stat,
std::map<ModelIdentifier, ModelStatistics>* model_stats)
{
model_stats->clear();
for (const auto& this_stat : infer_stat["model_stats"].GetArray()) {
auto it = model_stats
->emplace(
std::make_pair(
this_stat["name"].GetString(),
this_stat["version"].GetString()),
ModelStatistics())
.first;
it->second.inference_count_ = this_stat["inference_count"].GetUint64();
it->second.execution_count_ = this_stat["execution_count"].GetUint64();
it->second.success_count_ =
this_stat["inference_stats"]["success"]["count"].GetUint64();
it->second.queue_count_ =
this_stat["inference_stats"]["queue"]["count"].GetUint64();
it->second.compute_input_count_ =
this_stat["inference_stats"]["compute_input"]["count"].GetUint64();
it->second.compute_infer_count_ =
this_stat["inference_stats"]["compute_infer"]["count"].GetUint64();
it->second.compute_output_count_ =
this_stat["inference_stats"]["compute_output"]["count"].GetUint64();
it->second.cumm_time_ns_ =
this_stat["inference_stats"]["success"]["ns"].GetUint64();
it->second.queue_time_ns_ =
this_stat["inference_stats"]["queue"]["ns"].GetUint64();
it->second.compute_input_time_ns_ =
this_stat["inference_stats"]["compute_input"]["ns"].GetUint64();
it->second.compute_infer_time_ns_ =
this_stat["inference_stats"]["compute_infer"]["ns"].GetUint64();
it->second.compute_output_time_ns_ =
this_stat["inference_stats"]["compute_output"]["ns"].GetUint64();
it->second.cache_hit_count_ =
this_stat["inference_stats"]["cache_hit"]["count"].GetUint64();
it->second.cache_hit_time_ns_ =
this_stat["inference_stats"]["cache_hit"]["ns"].GetUint64();
it->second.cache_miss_count_ =
this_stat["inference_stats"]["cache_miss"]["count"].GetUint64();
it->second.cache_miss_time_ns_ =
this_stat["inference_stats"]["cache_miss"]["ns"].GetUint64();
}
}
void
TritonCApiClientBackend::ParseInferStat(
const tc::InferStat& triton_infer_stat, InferStat* infer_stat)
{
infer_stat->completed_request_count =
triton_infer_stat.completed_request_count;
infer_stat->cumulative_total_request_time_ns =
triton_infer_stat.cumulative_total_request_time_ns;
infer_stat->cumulative_send_time_ns =
triton_infer_stat.cumulative_send_time_ns;
infer_stat->cumulative_receive_time_ns =
triton_infer_stat.cumulative_receive_time_ns;
}
//==============================================================================
Error
TritonCApiInferInput::Create(
InferInput** infer_input, const std::string& name,
const std::vector<int64_t>& dims, const std::string& datatype)
{
TritonCApiInferInput* local_infer_input =
new TritonCApiInferInput(name, datatype);
tc::InferInput* triton_infer_input;
RETURN_IF_TRITON_ERROR(
tc::InferInput::Create(&triton_infer_input, name, dims, datatype));
local_infer_input->input_.reset(triton_infer_input);
*infer_input = local_infer_input;
return Error::Success;
}
const std::vector<int64_t>&
TritonCApiInferInput::Shape() const
{
return input_->Shape();
}
Error
TritonCApiInferInput::SetShape(const std::vector<int64_t>& shape)
{
RETURN_IF_TRITON_ERROR(input_->SetShape(shape));
return Error::Success;
}
Error
TritonCApiInferInput::Reset()
{
RETURN_IF_TRITON_ERROR(input_->Reset());
return Error::Success;
}
Error
TritonCApiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size)
{
RETURN_IF_TRITON_ERROR(input_->AppendRaw(input, input_byte_size));
return Error::Success;
}
Error
TritonCApiInferInput::SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset)
{
RETURN_IF_TRITON_ERROR(input_->SetSharedMemory(name, byte_size, offset));
return Error::Success;
}
TritonCApiInferInput::TritonCApiInferInput(
const std::string& name, const std::string& datatype)
: InferInput(BackendKind::TRITON_C_API, name, datatype)
{
}
//==============================================================================
Error
TritonCApiInferRequestedOutput::Create(
InferRequestedOutput** infer_output, const std::string& name,
const size_t class_count)
{
TritonCApiInferRequestedOutput* local_infer_output =
new TritonCApiInferRequestedOutput(name);
tc::InferRequestedOutput* triton_infer_output;
RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
&triton_infer_output, name, class_count));
local_infer_output->output_.reset(triton_infer_output);
*infer_output = local_infer_output;
return Error::Success;
}
Error
TritonCApiInferRequestedOutput::SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset)
{
RETURN_IF_TRITON_ERROR(output_->SetSharedMemory(name, byte_size, offset));
return Error::Success;
}
TritonCApiInferRequestedOutput::TritonCApiInferRequestedOutput(
const std::string& name)
: InferRequestedOutput(BackendKind::TRITON_C_API, name)
{
}
//==============================================================================
TritonCApiInferResult::TritonCApiInferResult(capi::InferResult* result)
{
result_.reset(result);
}
Error
TritonCApiInferResult::Id(std::string* id) const
{
RETURN_IF_TRITON_ERROR(result_->Id(id));
return Error::Success;
}
Error
TritonCApiInferResult::RequestStatus() const
{
RETURN_IF_TRITON_ERROR(result_->RequestStatus());
return Error::Success;
}
Error
TritonCApiInferResult::RawData(
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const
{
return Error(
"Output retrieval is not currently supported for Triton C API client "
"backend");
}
//==============================================================================
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "../client_backend.h"
#include "shared_memory_manager.h"
#include "triton_loader.h"
#define RETURN_IF_TRITON_ERROR(S) \
do { \
const tc::Error& status__ = (S); \
if (!status__.IsOk()) { \
return Error(status__.Message()); \
} \
} while (false)
#define FAIL_IF_TRITON_ERR(X, MSG) \
{ \
const tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace tc = triton::client;
namespace cb = triton::perfanalyzer::clientbackend;
namespace capi = triton::perfanalyzer::clientbackend::tritoncapi;
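// Illustrative usage sketch (not part of the original sources): the macros
// above adapt triton::client (tc) errors. RETURN_IF_TRITON_ERROR converts a
// failed tc::Error into this backend's Error and returns it from the current
// function; FAIL_IF_TRITON_ERR logs the message and exits. The helper below
// is hypothetical and simply wraps an already-constructed tc::Error.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
ReturnIfTritonErrorUsageExample(const tc::Error& status)
{
  // On failure this expands to: return Error(status.Message());
  RETURN_IF_TRITON_ERROR(status);
  return Error::Success;
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi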
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class InferResult;
//==============================================================================
/// TritonCApiClientBackend uses the Triton client C++ library types together
/// with the locally loaded Triton server C API library to communicate with
/// the inference server in-process.
///
class TritonCApiClientBackend : public ClientBackend {
public:
/// Create a triton client backend which can be used to interact with the
/// server.
/// \param triton_server_path Path to the Triton server install directory
/// that contains lib/libtritonserver.so.
/// \param model_repository_path The model repository.
/// \param verbose Enables the verbose mode of TritonServer.
/// \param client_backend Returns a new TritonCApiClientBackend object.
/// \return Error object indicating success
/// or failure.
static Error Create(
const std::string& triton_server_path,
const std::string& model_repository_path, const bool verbose,
std::unique_ptr<ClientBackend>* client_backend);
~TritonCApiClientBackend() { triton_loader_->Delete(); }
/// See ClientBackend::ServerExtensions()
Error ServerExtensions(std::set<std::string>* server_extensions) override;
/// See ClientBackend::ModelMetadata()
Error ModelMetadata(
rapidjson::Document* model_metadata, const std::string& model_name,
const std::string& model_version) override;
/// See ClientBackend::ModelConfig()
Error ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version) override;
/// See ClientBackend::Infer()
Error Infer(
cb::InferResult** result, const InferOptions& options,
const std::vector<InferInput*>& inputs,
const std::vector<const InferRequestedOutput*>& outputs) override;
/// See ClientBackend::ClientInferStat()
Error ClientInferStat(InferStat* infer_stat) override;
/// See ClientBackend::ModelInferenceStatistics()
Error ModelInferenceStatistics(
std::map<ModelIdentifier, ModelStatistics>* model_stats,
const std::string& model_name = "",
const std::string& model_version = "") override;
#ifdef TRITON_ENABLE_GPU
/// See ClientBackend::RegisterCudaMemory
Error RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size) override;
#endif // TRITON_ENABLE_GPU
/// See ClientBackend::RegisterSystemMemory
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size) override;
/// See ClientBackend::UnregisterAllSharedMemory
Error UnregisterAllSharedMemory();
private:
TritonCApiClientBackend()
: ClientBackend(BackendKind::TRITON_C_API),
triton_loader_(TritonLoader::GetSingleton())
{
}
void ParseInferInputToTriton(
const std::vector<InferInput*>& inputs,
std::vector<tc::InferInput*>* triton_inputs);
void ParseInferRequestedOutputToTriton(
const std::vector<const InferRequestedOutput*>& outputs,
std::vector<const tc::InferRequestedOutput*>* triton_outputs);
void ParseInferOptionsToTriton(
const InferOptions& options, tc::InferOptions* triton_options);
void ParseStatistics(
const rapidjson::Document& infer_stat,
std::map<ModelIdentifier, ModelStatistics>* model_stats);
void ParseInferStat(
const tc::InferStat& triton_infer_stat, InferStat* infer_stat);
TritonLoader* triton_loader_;
};
//==============================================================
/// TritonCApiInferInput is a wrapper around InferInput object of
/// triton client library.
///
class TritonCApiInferInput : public InferInput {
public:
static Error Create(
InferInput** infer_input, const std::string& name,
const std::vector<int64_t>& dims, const std::string& datatype);
/// Returns the raw InferInput object required by triton client library.
tc::InferInput* Get() const { return input_.get(); }
/// See InferInput::Shape()
const std::vector<int64_t>& Shape() const override;
/// See InferInput::SetShape()
Error SetShape(const std::vector<int64_t>& shape) override;
/// See InferInput::Reset()
Error Reset() override;
/// See InferInput::AppendRaw()
Error AppendRaw(const uint8_t* input, size_t input_byte_size) override;
/// See InferInput::SetSharedMemory()
Error SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset = 0) override;
private:
explicit TritonCApiInferInput(
const std::string& name, const std::string& datatype);
std::unique_ptr<tc::InferInput> input_;
};
//==============================================================
/// TritonCApiInferRequestedOutput is a wrapper around
/// InferRequestedOutput object of triton client library.
///
class TritonCApiInferRequestedOutput : public InferRequestedOutput {
public:
static Error Create(
InferRequestedOutput** infer_output, const std::string& name,
const size_t class_count = 0);
/// Returns the raw InferRequestedOutput object required by triton client
/// library.
tc::InferRequestedOutput* Get() const { return output_.get(); }
/// See InferInput::SetSharedMemory()
Error SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset = 0) override;
private:
explicit TritonCApiInferRequestedOutput(const std::string& name);
std::unique_ptr<tc::InferRequestedOutput> output_;
};
//==============================================================
/// TritonCApiInferResult is a wrapper around InferResult object of
/// the C API library.
///
class TritonCApiInferResult : public cb::InferResult {
public:
explicit TritonCApiInferResult(capi::InferResult* result);
/// See InferResult::Id()
Error Id(std::string* id) const override;
/// See InferResult::RequestStatus()
Error RequestStatus() const override;
/// See InferResult::RawData()
Error RawData(
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const override;
private:
std::unique_ptr<capi::InferResult> result_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
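// Illustrative usage sketch (not part of the original sources): create the
// C API client backend and query the server extensions. The server and model
// repository paths are hypothetical placeholders; the calls follow the
// declarations above.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
TritonCApiClientBackendUsageExample()
{
  std::unique_ptr<ClientBackend> backend;
  Error err = TritonCApiClientBackend::Create(
      "/opt/tritonserver" /* triton_server_path */,
      "/workspace/models" /* model_repository_path */, false /* verbose */,
      &backend);
  if (!err.IsOk()) {
    return err;
  }
  std::set<std::string> extensions;
  err = backend->ServerExtensions(&extensions);
  if (!err.IsOk()) {
    return err;
  }
  // ... build InferInput/InferRequestedOutput objects and call Infer() ...
  return Error::Success;
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi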
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define TRITON_INFERENCE_SERVER_CLIENT_CLASS \
triton::perfanalyzer::clientbackend::tritoncapi::TritonLoader
#include "triton_loader.h"
#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <sys/stat.h>
#include <future>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include "c_api_infer_results.h"
#include "scoped_defer.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
namespace {
struct AllocPayload {
struct OutputInfo {
enum Kind { BINARY, SHM };
Kind kind_;
void* base_;
uint64_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t device_id_;
// For shared memory
OutputInfo(
void* base, uint64_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t device_id)
: kind_(SHM), base_(base), byte_size_(byte_size),
memory_type_(memory_type), device_id_(device_id)
{
}
};
~AllocPayload()
{
for (auto it : output_map_) {
delete it.second;
}
}
std::unordered_map<std::string, OutputInfo*> output_map_;
};
bool helper_verbose = false;
/// Helper function for allocating memory
TRITONSERVER_Error*
ResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
// Initially attempt to make the actual memory type and id that we
// allocate be the same as preferred memory type
*actual_memory_type = preferred_memory_type;
*actual_memory_type_id = preferred_memory_type_id;
// This variable indicates whether the buffer should be freed or not.
bool* should_free = new bool;
*buffer_userp = should_free;
*should_free = false;
// If 'byte_size' is zero just return 'buffer' == nullptr, we don't
// need to do any other book-keeping.
if (byte_size == 0) {
*buffer = nullptr;
*buffer_userp = nullptr;
if (helper_verbose) {
std::cout << "allocated " << byte_size << " bytes for result tensor "
<< tensor_name << std::endl;
}
} else {
AllocPayload* alloc_payload = reinterpret_cast<AllocPayload*>(userp);
auto output_map_it = alloc_payload->output_map_.find(tensor_name);
if (output_map_it == alloc_payload->output_map_.end()) {
void* allocated_ptr = nullptr;
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
allocated_ptr = malloc(byte_size);
*should_free = true;
if (allocated_ptr != nullptr) {
*buffer = allocated_ptr;
}
} else {
// It is in shared memory
AllocPayload::OutputInfo* output_info = output_map_it->second;
if (byte_size > output_info->byte_size_) {
return TritonLoader::GetSingleton()->ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
"shared memory size specified with the request for output '" +
std::string(tensor_name) + "' (" +
std::to_string(output_info->byte_size_) +
" bytes) should be at least " + std::to_string(byte_size) +
" bytes to hold the results")
.c_str());
}
*actual_memory_type = output_info->memory_type_;
*actual_memory_type_id = output_info->device_id_;
*buffer = output_info->base_;
}
}
return nullptr; // Success
}
/// Helper function for releasing memory
TRITONSERVER_Error*
ResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
bool* should_free = reinterpret_cast<bool*>(buffer_userp);
switch (memory_type) {
case TRITONSERVER_MEMORY_CPU:
if (*should_free) {
free(buffer);
}
break;
}
free(should_free);
return nullptr; // Success
}
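// Illustrative sketch (not part of the original sources): when linking
// directly against libtritonserver rather than resolving symbols through
// dlopen as TritonLoader does below, the two callbacks above would be paired
// with a response allocator roughly like this. The helper is hypothetical.
inline TRITONSERVER_Error*
CreateExampleResponseAllocator(TRITONSERVER_ResponseAllocator** allocator)
{
  // Register ResponseAlloc/ResponseRelease; no per-request start callback.
  return TRITONSERVER_ResponseAllocatorNew(
      allocator, ResponseAlloc, ResponseRelease, nullptr /* start_fn */);
}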
void
InferRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
TritonLoader::GetSingleton()->DeleteInferRequest(request);
}
void
InferResponseComplete(
TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
{
if (response != nullptr) {
// Send 'response' to the future.
std::promise<TRITONSERVER_InferenceResponse*>* p =
reinterpret_cast<std::promise<TRITONSERVER_InferenceResponse*>*>(userp);
p->set_value(response);
delete p;
}
}
Error
GetModelVersionFromString(const std::string& version_string, int64_t* version)
{
if (version_string.empty()) {
*version = 1;
return Error::Success;
}
try {
*version = std::stol(version_string);
}
catch (std::exception& e) {
return Error(
std::string(
"Failed to get model version from specified version string '" +
version_string + "' (details: " + e.what() +
"), version should be an integral value > 0")
.c_str());
}
if (*version < 0) {
return Error(std::string(
"invalid model version specified '" + version_string +
"' , version should be an integral value > 0")
.c_str());
}
return Error::Success;
}
Error
FolderExists(const std::string& path)
{
struct stat buffer;
if (!stat(path.c_str(), &buffer)) {
return Error::Success;
} else {
return Error("Unable to find filepath: " + path);
}
}
} // namespace
Error
TritonLoader::Create(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose)
{
if (!GetSingleton()->ServerIsReady()) {
GetSingleton()->ClearHandles();
RETURN_IF_ERROR(GetSingleton()->PopulateInternals(
triton_server_path, model_repository_path, verbose));
RETURN_IF_ERROR(GetSingleton()->LoadServerLibrary());
RETURN_IF_ERROR(GetSingleton()->StartTriton());
}
return Error::Success;
}
Error
TritonLoader::Delete()
{
if (server_ != nullptr) {
server_is_ready_ = false;
model_is_loaded_ = false;
server_.reset();
}
return Error::Success;
}
Error
TritonLoader::PopulateInternals(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose)
{
RETURN_IF_ERROR(FolderExists(triton_server_path));
RETURN_IF_ERROR(FolderExists(model_repository_path));
triton_server_path_ = triton_server_path;
model_repository_path_ = model_repository_path;
verbose_ = verbose;
verbose_level_ = verbose_ ? 1 : 0;
return Error::Success;
}
Error
TritonLoader::StartTriton()
{
// Check API version.
uint32_t api_version_major, api_version_minor;
REPORT_TRITONSERVER_ERROR(
api_version_fn_(&api_version_major, &api_version_minor));
if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) ||
(TRITONSERVER_API_VERSION_MINOR > api_version_minor)) {
std::stringstream sstream;
sstream << "triton server API version mismatch. \n"
<< "Expected version major:" << TRITONSERVER_API_VERSION_MAJOR
<< ", minor:" << TRITONSERVER_API_VERSION_MINOR << "\n"
<< " Actual version major:" << api_version_major
<< ", minor:" << api_version_minor;
return Error(sstream.str());
}
// Create the server...
TRITONSERVER_ServerOptions* server_options = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
options_new_fn_(&server_options), "creating server options");
RETURN_IF_TRITONSERVER_ERROR(
options_set_model_repo_path_fn_(
server_options, model_repository_path_.c_str()),
"setting model repository path");
RETURN_IF_TRITONSERVER_ERROR(
set_cuda_memory_pool_byte_size_(server_options, 0, 1073741824),
"setting cuda memory pool byte size failed.");
RETURN_IF_TRITONSERVER_ERROR(
set_log_verbose_fn_(server_options, verbose_level_),
"setting verbose logging level");
RETURN_IF_TRITONSERVER_ERROR(
set_log_info_fn_(server_options, verbose_),
"setting if log verbose level is true");
RETURN_IF_TRITONSERVER_ERROR(
set_backend_directory_fn_(
server_options, (triton_server_path_ + "/backends").c_str()),
"setting backend directory");
RETURN_IF_TRITONSERVER_ERROR(
set_repo_agent_directory_fn_(
server_options, (triton_server_path_ + "/repoagents").c_str()),
"setting repository agent directory");
RETURN_IF_TRITONSERVER_ERROR(
set_strict_model_config_fn_(server_options, true),
"setting strict model configuration");
double min_compute_capability = 0;
// FIXME: Do not have GPU support right now
RETURN_IF_TRITONSERVER_ERROR(
set_min_supported_compute_capability_fn_(
server_options, min_compute_capability),
"setting minimum supported CUDA compute capability");
TRITONSERVER_Server* server_ptr = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
server_new_fn_(&server_ptr, server_options), "creating server");
RETURN_IF_TRITONSERVER_ERROR(
server_options_delete_fn_(server_options), "deleting server options");
std::shared_ptr<TRITONSERVER_Server> shared_server(
server_ptr, server_delete_fn_);
server_ = shared_server;
// Wait until the server is both live and ready.
size_t health_iters = 0;
while (true) {
bool live, ready;
RETURN_IF_TRITONSERVER_ERROR(
server_is_live_fn_(server_.get(), &live),
"unable to get server liveness");
RETURN_IF_TRITONSERVER_ERROR(
server_is_ready_fn_(server_.get(), &ready),
"unable to get server readiness");
if (live && ready) {
server_is_ready_ = true;
break;
}
if (++health_iters >= 10) {
return Error("failed to find healthy inference server");
}
std::this_thread::sleep_for(std::chrono::milliseconds(500));
}
// Print status of the server.
if (verbose_) {
TRITONSERVER_Message* server_metadata_message;
RETURN_IF_TRITONSERVER_ERROR(
server_metadata_fn_(server_.get(), &server_metadata_message),
"unable to get server metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
server_metadata_message, &buffer, &byte_size),
"unable to serialize server metadata message");
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(server_metadata_message),
"deleting status metadata");
}
return Error::Success;
}
Error
TritonLoader::ServerMetaData(rapidjson::Document* server_metadata)
{
if (!ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* server_metadata_message;
RETURN_IF_TRITONSERVER_ERROR(
server_metadata_fn_(server_.get(), &server_metadata_message),
"unable to get server metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
server_metadata_message, &buffer, &byte_size),
"unable to serialize server metadata message");
server_metadata->Parse(buffer, byte_size);
if (server_metadata->HasParseError()) {
return Error(
"error: failed to parse server metadata from JSON: " +
std::string(GetParseError_En(server_metadata->GetParseError())) +
" at " + std::to_string(server_metadata->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(server_metadata_message), "deleting status metadata");
return Error::Success;
}
Error
TritonLoader::LoadModel(
const std::string& model_name, const std::string& model_version)
{
if (!ServerIsReady()) {
return Error("server is not ready, abort!");
}
model_name_ = model_name;
RETURN_IF_ERROR(GetModelVersionFromString(model_version, &model_version_));
// Wait for the model to become available.
bool is_ready = false;
size_t health_iters = 0;
// A model repository path is required before a model can be loaded.
if (model_repository_path_.empty()) {
return Error("Need to specify model repository");
}
while (!is_ready) {
RETURN_IF_TRITONSERVER_ERROR(
model_is_ready_fn_(
server_.get(), model_name_.c_str(), model_version_, &is_ready),
"unable to get model readiness");
if (!is_ready) {
if (++health_iters >= 10) {
return Error("model failed to be ready in 10 iterations");
}
std::this_thread::sleep_for(std::chrono::milliseconds(500));
continue;
}
}
// flag to confirm model is correct and loaded
model_is_loaded_ = true;
return Error::Success;
}
Error
TritonLoader::ModelMetadata(rapidjson::Document* model_metadata)
{
if (!ModelIsLoaded() || !ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* model_metadata_message;
// get model metadata
RETURN_IF_TRITONSERVER_ERROR(
model_metadata_fn_(
server_.get(), model_name_.c_str(), model_version_,
&model_metadata_message),
"unable to get model metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
model_metadata_message, &buffer, &byte_size),
"unable to serialize model status protobuf");
model_metadata->Parse(buffer, byte_size);
if (model_metadata->HasParseError()) {
return Error(
"error: failed to parse model metadata from JSON: " +
std::string(GetParseError_En(model_metadata->GetParseError())) +
" at " + std::to_string(model_metadata->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_metadata_message), "deleting status protobuf");
if (strcmp((*model_metadata)["name"].GetString(), model_name_.c_str())) {
return Error("unable to find metadata for model");
}
bool found_version = false;
if (model_metadata->HasMember("versions")) {
for (const auto& version : (*model_metadata)["versions"].GetArray()) {
if (strcmp(version.GetString(), std::to_string(model_version_).c_str()) ==
0) {
found_version = true;
break;
}
}
}
if (!found_version) {
std::string msg = "unable to find version " +
std::to_string(model_version_) + " status for model";
return Error(msg);
}
return Error::Success;
}
Error
TritonLoader::ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version)
{
if (!ModelIsLoaded() || !ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* model_config_message;
uint32_t config_version = 1;
RETURN_IF_TRITONSERVER_ERROR(
model_config_fn_(
(server_).get(), model_name.c_str(), model_version_, config_version,
&model_config_message),
"unable to get model config message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(model_config_message, &buffer, &byte_size),
"unable to serialize model config status protobuf");
model_config->Parse(buffer, byte_size);
if (model_config->HasParseError()) {
return Error(
"error: failed to parse model config from JSON: " +
std::string(GetParseError_En(model_config->GetParseError())) + " at " +
std::to_string(model_config->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_config_message),
"deleting server config status protobuf");
return Error::Success;
}
Error
TritonLoader::LoadServerLibrary()
{
std::string full_path = triton_server_path_ + server_library_path_;
RETURN_IF_ERROR(FolderExists(full_path));
RETURN_IF_ERROR(OpenLibraryHandle(full_path, &dlhandle_));
TritonServerApiVersionFn_t apifn;
TritonServerOptionsNewFn_t onfn;
TritonServerOptionSetModelRepoPathFn_t rpfn;
TritonServerSetLogVerboseFn_t slvfn;
TritonServerSetBackendDirFn_t sbdfn;
TritonServerSetRepoAgentDirFn_t srdfn;
TritonServerSetStrictModelConfigFn_t ssmcfn;
TritonServerSetMinSupportedComputeCapabilityFn_t smsccfn;
TritonServerNewFn_t snfn;
TritonServerOptionsDeleteFn_t odfn;
TritonServerDeleteFn_t sdfn;
TritonServerIsLiveFn_t ilfn;
TritonServerIsReadyFn_t irfn;
TritonServerMetadataFn_t smfn;
TritonServerMessageSerializeToJsonFn_t stjfn;
TritonServerMessageDeleteFn_t mdfn;
TritonServerModelIsReadyFn_t mirfn;
TritonServerModelMetadataFn_t mmfn;
TritonServerResponseAllocatorNewFn_t ranfn;
TritonServerInferenceRequestNewFn_t irnfn;
TritonServerInferenceRequestSetIdFn_t irsifn;
TritonServerInferenceRequestSetReleaseCallbackFn_t irsrcfn;
TritonServerInferenceRequestAddInputFn_t iraifn;
TritonServerInferenceRequestAddRequestedOutputFn_t irarofn;
TritonServerInferenceRequestAppendInputDataFn_t iraidfn;
TritonServerInferenceRequestSetResponseCallbackFn_t irsrescfn;
TritonServerInferAsyncFn_t iafn;
TritonServerInferenceResponseErrorFn_t irefn;
TritonServerInferenceResponseDeleteFn_t irdfn;
TritonServerResponseAllocatorDeleteFn_t radfn;
TritonServerErrorNewFn_t enfn;
TritonServerMemoryTypeStringFn_t mtsfn;
TritonServerInferenceResponseOutputCountFn_t irocfn;
TritonServerDataTypeStringFn_t dtsfn;
TritonServerErrorDeleteFn_t edfn;
TritonServerErrorCodeToStringFn_t ectsfn;
TritonServerErrorMessageFn_t emfn;
TritonServerModelConfigFn_t mcfn;
TritonServerInferenceRequestSetCorrelationIdFn_t scidfn;
TritonServerInferenceRequestSetStringCorrelationIdFn_t sscidfn;
TritonServerInferenceRequestSetFlagsFn_t sffn;
TritonServerInferenceRequestSetPriorityFn_t spfn;
TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t stmsfn;
TritonServerStringToDatatypeFn_t stdtfn;
TritonServerInferenceResponseOutputFn_t irofn;
TritonServerRequestIdFn_t ridfn;
TritonServerRequestDeleteFn_t rdfn;
TritonServerModelStatisticsFn_t msfn;
TritonSeverUnloadModelFn_t umfn;
TritonSeverSetLogInfoFn_t slifn;
TritonServerSetCudaMemoryPoolByteSizeFn_t scmpbsfn;
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ApiVersion", false /* optional */,
reinterpret_cast<void**>(&apifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsNew", false /* optional */,
reinterpret_cast<void**>(&onfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetModelRepositoryPath",
false /* optional */, reinterpret_cast<void**>(&rpfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetLogVerbose",
false /* optional */, reinterpret_cast<void**>(&slvfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetBackendDirectory",
false /* optional */, reinterpret_cast<void**>(&sbdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetRepoAgentDirectory",
false /* optional */, reinterpret_cast<void**>(&srdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetStrictModelConfig",
false /* optional */, reinterpret_cast<void**>(&ssmcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability",
false /* optional */, reinterpret_cast<void**>(&smsccfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize",
false /* optional */, reinterpret_cast<void**>(&scmpbsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerNew", false /* optional */,
reinterpret_cast<void**>(&snfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsDelete", false /* optional */,
reinterpret_cast<void**>(&odfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerDelete", false /* optional */,
reinterpret_cast<void**>(&sdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerIsLive", false /* optional */,
reinterpret_cast<void**>(&ilfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerIsReady", false /* optional */,
reinterpret_cast<void**>(&irfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerMetadata", false /* optional */,
reinterpret_cast<void**>(&smfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MessageSerializeToJson", false /* optional */,
reinterpret_cast<void**>(&stjfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MessageDelete", false /* optional */,
reinterpret_cast<void**>(&mdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelIsReady", false /* optional */,
reinterpret_cast<void**>(&mirfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelMetadata", false /* optional */,
reinterpret_cast<void**>(&mmfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ResponseAllocatorNew", false /* optional */,
reinterpret_cast<void**>(&ranfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestNew", false /* optional */,
reinterpret_cast<void**>(&irnfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetId", false /* optional */,
reinterpret_cast<void**>(&irsifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetReleaseCallback",
false /* optional */, reinterpret_cast<void**>(&irsrcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAddInput", false /* optional */,
reinterpret_cast<void**>(&iraifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAddRequestedOutput",
false /* optional */, reinterpret_cast<void**>(&irarofn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAppendInputData",
false /* optional */, reinterpret_cast<void**>(&iraidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetResponseCallback",
false /* optional */, reinterpret_cast<void**>(&irsrescfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerInferAsync", false /* optional */,
reinterpret_cast<void**>(&iafn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseError", false /* optional */,
reinterpret_cast<void**>(&irefn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseDelete", false /* optional */,
reinterpret_cast<void**>(&irdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ResponseAllocatorDelete", false /* optional */,
reinterpret_cast<void**>(&radfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorNew", false /* optional */,
reinterpret_cast<void**>(&enfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MemoryTypeString", false /* optional */,
reinterpret_cast<void**>(&mtsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseOutputCount",
false /* optional */, reinterpret_cast<void**>(&irocfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_DataTypeString", false /* optional */,
reinterpret_cast<void**>(&dtsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorDelete", false /* optional */,
reinterpret_cast<void**>(&edfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorCodeString", false /* optional */,
reinterpret_cast<void**>(&ectsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorMessage", false /* optional */,
reinterpret_cast<void**>(&emfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelConfig", false /* optional */,
reinterpret_cast<void**>(&mcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetCorrelationId",
false /* optional */, reinterpret_cast<void**>(&scidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetCorrelationIdString",
false /* optional */, reinterpret_cast<void**>(&sscidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetFlags", false /* optional */,
reinterpret_cast<void**>(&sffn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetPriorityUInt64",
false /* optional */, reinterpret_cast<void**>(&spfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetTimeoutMicroseconds",
false /* optional */, reinterpret_cast<void**>(&stmsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_StringToDataType", false /* optional */,
reinterpret_cast<void**>(&stdtfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseOutput", false /* optional */,
reinterpret_cast<void**>(&irofn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestId", false /* optional */,
reinterpret_cast<void**>(&ridfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestDelete", false /* optional */,
reinterpret_cast<void**>(&rdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelStatistics", false /* optional */,
reinterpret_cast<void**>(&msfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerUnloadModel", false /* optional */,
reinterpret_cast<void**>(&umfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetLogInfo", false /* optional */,
reinterpret_cast<void**>(&slifn)));
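  // All entrypoints were resolved successfully, so commit them to the member
  // handles. A failed lookup above returns early and leaves the members
  // untouched.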
api_version_fn_ = apifn;
options_new_fn_ = onfn;
options_set_model_repo_path_fn_ = rpfn;
set_log_verbose_fn_ = slvfn;
set_backend_directory_fn_ = sbdfn;
set_repo_agent_directory_fn_ = srdfn;
set_strict_model_config_fn_ = ssmcfn;
set_min_supported_compute_capability_fn_ = smsccfn;
server_new_fn_ = snfn;
server_options_delete_fn_ = odfn;
server_delete_fn_ = sdfn;
server_is_live_fn_ = ilfn;
server_is_ready_fn_ = irfn;
server_metadata_fn_ = smfn;
message_serialize_to_json_fn_ = stjfn;
message_delete_fn_ = mdfn;
model_is_ready_fn_ = mirfn;
model_metadata_fn_ = mmfn;
response_allocator_new_fn_ = ranfn;
inference_request_new_fn_ = irnfn;
inference_request_set_id_fn_ = irsifn;
inference_request_set_release_callback_fn_ = irsrcfn;
inference_request_add_input_fn_ = iraifn;
inference_request_add_requested_output_fn_ = irarofn;
inference_request_append_input_data_fn_ = iraidfn;
inference_request_set_response_callback_fn_ = irsrescfn;
infer_async_fn_ = iafn;
inference_response_error_fn_ = irefn;
inference_response_delete_fn_ = irdfn;
response_allocator_delete_fn_ = radfn;
error_new_fn_ = enfn;
memory_type_string_fn_ = mtsfn;
inference_response_output_count_fn_ = irocfn;
data_type_string_fn_ = dtsfn;
error_delete_fn_ = edfn;
error_code_to_string_fn_ = ectsfn;
error_message_fn_ = emfn;
model_config_fn_ = mcfn;
set_correlation_id_fn_ = scidfn;
set_string_correlation_id_fn_ = sscidfn;
set_flags_fn_ = sffn;
set_priority_fn_ = spfn;
set_timeout_ms_fn_ = stmsfn;
string_to_datatype_fn_ = stdtfn;
inference_response_output_fn_ = irofn;
request_id_fn_ = ridfn;
request_delete_fn_ = rdfn;
model_statistics_fn_ = msfn;
unload_model_fn_ = umfn;
set_log_info_fn_ = slifn;
set_cuda_memory_pool_byte_size_ = scmpbsfn;
return Error::Success;
}
void
TritonLoader::ClearHandles()
{
dlhandle_ = nullptr;
api_version_fn_ = nullptr;
options_new_fn_ = nullptr;
options_set_model_repo_path_fn_ = nullptr;
set_log_verbose_fn_ = nullptr;
set_backend_directory_fn_ = nullptr;
set_repo_agent_directory_fn_ = nullptr;
set_strict_model_config_fn_ = nullptr;
set_min_supported_compute_capability_fn_ = nullptr;
server_new_fn_ = nullptr;
server_options_delete_fn_ = nullptr;
server_delete_fn_ = nullptr;
server_is_live_fn_ = nullptr;
server_is_ready_fn_ = nullptr;
server_metadata_fn_ = nullptr;
message_serialize_to_json_fn_ = nullptr;
message_delete_fn_ = nullptr;
model_is_ready_fn_ = nullptr;
model_metadata_fn_ = nullptr;
response_allocator_new_fn_ = nullptr;
inference_request_new_fn_ = nullptr;
inference_request_set_id_fn_ = nullptr;
inference_request_set_release_callback_fn_ = nullptr;
inference_request_add_input_fn_ = nullptr;
inference_request_add_requested_output_fn_ = nullptr;
inference_request_append_input_data_fn_ = nullptr;
inference_request_set_response_callback_fn_ = nullptr;
infer_async_fn_ = nullptr;
inference_response_error_fn_ = nullptr;
inference_response_delete_fn_ = nullptr;
response_allocator_delete_fn_ = nullptr;
error_new_fn_ = nullptr;
memory_type_string_fn_ = nullptr;
inference_response_output_count_fn_ = nullptr;
data_type_string_fn_ = nullptr;
error_message_fn_ = nullptr;
error_delete_fn_ = nullptr;
error_code_to_string_fn_ = nullptr;
model_config_fn_ = nullptr;
set_correlation_id_fn_ = nullptr;
set_string_correlation_id_fn_ = nullptr;
set_flags_fn_ = nullptr;
set_priority_fn_ = nullptr;
set_timeout_ms_fn_ = nullptr;
string_to_datatype_fn_ = nullptr;
inference_response_output_fn_ = nullptr;
request_id_fn_ = nullptr;
request_delete_fn_ = nullptr;
model_statistics_fn_ = nullptr;
unload_model_fn_ = nullptr;
  set_log_info_fn_ = nullptr;
  set_cuda_memory_pool_byte_size_ = nullptr;
}
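// Note: existence is tested by attempting to open the file for reading, so a
// file that exists but is unreadable is also reported as missing.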
Error
TritonLoader::FileExists(std::string& filepath)
{
std::ifstream ifile;
ifile.open(filepath);
if (!ifile) {
return Error("unable to find local Triton library: " + filepath);
} else {
return Error::Success;
}
}
Error
TritonLoader::Infer(
const tc::InferOptions& options, const std::vector<tc::InferInput*>& inputs,
const std::vector<const tc::InferRequestedOutput*>& outputs,
InferResult** result)
{
Error error = Error::Success;
if (!ServerIsReady() || !ModelIsLoaded()) {
return Error("Server is not ready and/or requested model is not loaded");
}
TRITONSERVER_ResponseAllocator* allocator = nullptr;
TRITONSERVER_InferenceRequest* irequest = nullptr;
TRITONSERVER_InferenceResponse* completed_response = nullptr;
tc::RequestTimers timer;
timer.Reset();
timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_START);
RETURN_IF_ERROR(InitializeRequest(options, outputs, &allocator, &irequest));
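  // From this point on any early return must release the response and the
  // allocator; the ScopedDefer runs CleanUp either when Complete() is called
  // on the success path below or when this scope unwinds.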
ScopedDefer error_handler([&error, &completed_response, &allocator, this] {
error = CleanUp(completed_response, allocator);
});
RETURN_IF_ERROR(AddInputs(inputs, irequest));
RETURN_IF_ERROR(AddOutputs(outputs, irequest));
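  // For outputs the caller registered in shared memory, record the target
  // buffer in alloc_payload; it is handed to the response allocator (via
  // response_allocator_userp below) so results can be placed directly into
  // that region.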
AllocPayload alloc_payload;
for (auto& output : outputs) {
if (output->IsSharedMemory()) {
std::string shm_name;
size_t shm_byte_size;
size_t offset;
// TODO: Error handling
output->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset);
void* buf;
TRITONSERVER_MemoryType memory_type;
int64_t memory_type_id;
RETURN_IF_ERROR(shm_manager_->GetMemoryInfo(
shm_name, offset, &buf, &memory_type, &memory_type_id));
alloc_payload.output_map_.emplace(
std::piecewise_construct, std::forward_as_tuple(output->Name()),
std::forward_as_tuple(new AllocPayload::OutputInfo(
buf, shm_byte_size, memory_type, memory_type_id)));
}
}
const char* cid = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
request_id_fn_(irequest, &cid), "Failed to get request id");
std::string id = cid;
// Perform inference...
timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_START);
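  // The raw promise is handed to the response callback through the userp
  // pointer; the callback is expected to fulfill it with the completed
  // response (and to delete the promise), letting this thread block on the
  // future below.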
auto p = new std::promise<TRITONSERVER_InferenceResponse*>();
std::future<TRITONSERVER_InferenceResponse*> completed = p->get_future();
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_response_callback_fn_(
irequest, allocator, &alloc_payload /* response_allocator_userp */,
InferResponseComplete, reinterpret_cast<void*>(p)),
"setting response callback");
RETURN_IF_TRITONSERVER_ERROR(
infer_async_fn_((server_).get(), irequest, nullptr /* trace */),
"running inference");
timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_END);
// Wait for the inference to complete.
completed_response = completed.get();
RETURN_IF_TRITONSERVER_ERROR(
inference_response_error_fn_(completed_response),
"inference response error");
timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_START);
timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_END);
timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_END);
tc::Error err = UpdateInferStat(timer);
if (!err.IsOk()) {
std::cerr << "Failed to update context stat: " << err << std::endl;
}
InferResult::Create(result, err, id);
// CleanUp the response allocators
error_handler.Complete();
return error;
}
Error
TritonLoader::CleanUp(
TRITONSERVER_InferenceResponse* completed_response,
TRITONSERVER_ResponseAllocator* allocator)
{
TRITONSERVER_Error* response_err = nullptr;
if (completed_response != nullptr) {
response_err = inference_response_delete_fn_(completed_response);
}
TRITONSERVER_Error* allocator_err = response_allocator_delete_fn_(allocator);
RETURN_IF_TRITONSERVER_ERROR(response_err, "deleting inference response");
RETURN_IF_TRITONSERVER_ERROR(allocator_err, "deleting response allocator");
return Error::Success;
}
Error
TritonLoader::InitializeRequest(
const tc::InferOptions& options,
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_InferenceRequest** irequest)
{
// Create the allocator that will be used to allocate buffers for
// the result tensors.
RETURN_IF_TRITONSERVER_ERROR(
GetSingleton()->response_allocator_new_fn_(
allocator,
reinterpret_cast<
TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator,
const char* tensor_name, size_t byte_size,
TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, void* userp,
void** buffer, void** buffer_userp,
TRITONSERVER_MemoryType*
actual_memory_type,
int64_t* actual_memory_type_id)>(
ResponseAlloc),
reinterpret_cast<
TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator,
void* buffer, void* buffer_userp,
size_t byte_size,
TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)>(ResponseRelease),
nullptr /* start_fn */),
"creating response allocator");
// set up inference request
RETURN_IF_TRITONSERVER_ERROR(
inference_request_new_fn_(
irequest, (server_).get(), model_name_.c_str(), model_version_),
"creating inference request");
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_id_fn_(*irequest, options.request_id_.c_str()),
"setting ID for the request");
if ((options.sequence_id_ != 0) || (options.sequence_id_str_ != "") ||
(options.priority_ != 0) || (options.server_timeout_ != 0) ||
outputs.empty()) {
if (options.sequence_id_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_correlation_id_fn_(*irequest, options.sequence_id_),
"setting sequence ID for the request");
} else if (options.sequence_id_str_ != "") {
RETURN_IF_TRITONSERVER_ERROR(
set_string_correlation_id_fn_(
*irequest, options.sequence_id_str_.c_str()),
"setting sequence ID for the request");
}
uint32_t flags = 0;
if (options.sequence_start_) {
flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
}
if (options.sequence_end_) {
flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
}
RETURN_IF_TRITONSERVER_ERROR(
set_flags_fn_(*irequest, flags),
"setting inference flags for the request");
}
if (options.priority_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_priority_fn_(*irequest, options.priority_),
"setting priority for the request");
}
if (options.server_timeout_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_timeout_ms_fn_(*irequest, options.server_timeout_),
"setting timeout for the request");
}
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_release_callback_fn_(
*irequest, InferRequestComplete, nullptr /* request_release_userp */),
"setting request release callback");
return Error::Success;
}
Error
TritonLoader::AddInputs(
const std::vector<tc::InferInput*>& inputs,
TRITONSERVER_InferenceRequest* irequest)
{
for (auto io : inputs) {
const char* input_name = io->Name().c_str();
const char* datatype = io->Datatype().c_str();
const TRITONSERVER_DataType dtype = string_to_datatype_fn_(datatype);
std::vector<int64_t> shape_vec;
for (const int64_t dim : io->Shape()) { // this is a vector, just use it
shape_vec.push_back(dim);
}
RETURN_IF_TRITONSERVER_ERROR(
inference_request_add_input_fn_(
irequest, input_name, dtype, &shape_vec[0], shape_vec.size()),
"setting input for the request");
size_t byte_size;
tc::Error err = io->ByteSize(&byte_size);
if (!err.IsOk()) {
return Error(err.Message());
}
if (byte_size == 0) {
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, nullptr, 0 /* byte_size */,
TRITONSERVER_MEMORY_CPU /* memory type */,
0 /* memory_type_id */),
"appending input data with byte size zero");
} else {
if (!io->IsSharedMemory()) {
io->PrepareForRequest();
bool end_of_input = false;
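        // Inputs not backed by shared memory may be supplied in several
        // chunks; append each buffer returned by GetNext() until the input
        // is exhausted.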
while (!end_of_input) {
const uint8_t* buf;
size_t buf_size;
io->GetNext(&buf, &buf_size, &end_of_input);
if (buf != nullptr) {
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, const_cast<uint8_t*>(buf), buf_size,
TRITONSERVER_MEMORY_CPU /* memory_type */,
0 /* memory_type_id */),
"appending data to tritonserver");
}
}
} else {
std::string shm_name;
size_t shm_byte_size;
size_t offset;
// TODO: Error handling
io->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset);
void* buf;
TRITONSERVER_MemoryType memory_type;
int64_t memory_type_id;
RETURN_IF_ERROR(shm_manager_->GetMemoryInfo(
shm_name, offset, &buf, &memory_type, &memory_type_id));
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, buf, byte_size,
memory_type /* memory_type */,
memory_type_id /* memory_type_id */),
"appending data to tritonserver");
}
}
}
return Error::Success;
}
Error
TritonLoader::AddOutputs(
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_InferenceRequest* irequest)
{
for (auto io : outputs) {
const char* output_name = io->Name().c_str();
RETURN_IF_TRITONSERVER_ERROR(
inference_request_add_requested_output_fn_(irequest, output_name),
"setting output for the request");
}
return Error::Success;
}
Error
TritonLoader::ModelInferenceStatistics(
const std::string& model_name, const std::string& model_version,
rapidjson::Document* infer_stat)
{
if (ServerIsReady() && ModelIsLoaded()) {
TRITONSERVER_Message* model_stats_message = nullptr;
int64_t requested_model_version;
auto err =
GetModelVersionFromString(model_version, &requested_model_version);
if (err.IsOk()) {
RETURN_IF_TRITONSERVER_ERROR(
model_statistics_fn_(
(server_).get(), model_name.c_str(), requested_model_version,
&model_stats_message),
"getting model statistics from server");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
model_stats_message, &buffer, &byte_size),
"serializing message to json");
infer_stat->Parse(buffer, byte_size);
if (infer_stat->HasParseError()) {
return Error(
"error: failed to parse server metadata from JSON: " +
std::string(GetParseError_En(infer_stat->GetParseError())) +
" at " + std::to_string(infer_stat->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_stats_message),
"deleting inference statistics message");
}
return err;
} else {
return Error(
"Trying to get model statistics while server is not started or model "
"is not ready");
}
}
TritonLoader*
TritonLoader::GetSingleton()
{
static TritonLoader loader;
return &loader;
}
TritonLoader::~TritonLoader()
{
FAIL_IF_ERR(Delete(), "dereferencing server instance...");
FAIL_IF_ERR(CloseLibraryHandle(dlhandle_), "error on closing triton loader");
ClearHandles();
}
#ifdef TRITON_ENABLE_GPU
Error
TritonLoader::RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size)
{
RETURN_IF_ERROR(shm_manager_->RegisterCUDAMemory(
name, handle, byte_size, 0 /* device id */));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
Error
TritonLoader::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
RETURN_IF_ERROR(shm_manager_->RegisterSystemMemory(name, ptr, byte_size));
return Error::Success;
}
Error
TritonLoader::UnregisterAllSharedMemory()
{
  RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_CPU));
  RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_GPU));
return Error::Success;
}
TRITONSERVER_Error*
TritonLoader::ErrorNew(TRITONSERVER_Error_Code code, const char* message)
{
return error_new_fn_(code, message);
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "../client_backend.h"
#include "common.h"
#include "shared_library.h"
#include "shared_memory_manager.h"
#include "triton/core/tritonserver.h"
// If TRITONSERVER error is non-OK, return the corresponding status.
#define RETURN_IF_TRITONSERVER_ERROR(E, MSG) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cout << "error: " << (MSG) << ": " \
<< GetSingleton()->error_code_to_string_fn_(err__) << " - " \
<< GetSingleton()->error_message_fn_(err__) << std::endl; \
Error newErr = Error(MSG); \
GetSingleton()->error_delete_fn_(err__); \
return newErr; \
} \
} while (false)
#define FAIL_IF_TRITONSERVER_ERROR(E, MSG) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cerr << "error: " << (MSG) << ": " \
<< GetSingleton()->error_code_to_string_fn_(err__) << " - " \
<< GetSingleton()->error_message_fn_(err__) << std::endl; \
Error newErr = Error(MSG); \
GetSingleton()->error_delete_fn_(err__); \
exit(1); \
} \
} while (false)
#define REPORT_TRITONSERVER_ERROR(E) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cout << GetSingleton()->error_message_fn_(err__) << std::endl; \
GetSingleton()->error_delete_fn_(err__); \
} \
} while (false)
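// A minimal usage sketch (assuming the macro is expanded inside a
// TritonLoader member, where the resolved entrypoints such as
// server_is_live_fn_ are accessible):
//   bool live = false;
//   RETURN_IF_TRITONSERVER_ERROR(
//       server_is_live_fn_(server_.get(), &live), "checking server liveness");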
namespace tc = triton::client;
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class InferResult;
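// TritonLoader dynamically loads libtritonserver.so and exposes an in-process
// Triton server to the perf_analyzer C API backend. A rough usage sketch (the
// exact call sequence is driven by the calling backend):
//   TritonLoader::Create(triton_server_path, model_repository_path, verbose);
//   TritonLoader::GetSingleton()->LoadModel(model_name, model_version);
//   TritonLoader::GetSingleton()->Infer(options, inputs, outputs, &result);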
class TritonLoader : public tc::InferenceServerClient {
public:
~TritonLoader();
static Error Create(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose);
Error Delete();
Error StartTriton();
Error LoadModel(
const std::string& model_name, const std::string& model_version);
Error ModelMetadata(rapidjson::Document* model_metadata);
Error ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version);
Error ServerMetaData(rapidjson::Document* server_metadata);
Error Infer(
const tc::InferOptions& options,
const std::vector<tc::InferInput*>& inputs,
const std::vector<const tc::InferRequestedOutput*>& outputs,
InferResult** result);
Error CleanUp(
TRITONSERVER_InferenceResponse* completed_response,
TRITONSERVER_ResponseAllocator* allocator);
Error ModelInferenceStatistics(
const std::string& model_name, const std::string& model_version,
rapidjson::Document* infer_stat);
Error ClientInferStat(tc::InferStat* infer_stat)
{
*infer_stat = infer_stat_;
return Error::Success;
}
#ifdef TRITON_ENABLE_GPU
Error RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size);
#endif // TRITON_ENABLE_GPU
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size);
Error UnregisterAllSharedMemory();
TRITONSERVER_Error* ErrorNew(
TRITONSERVER_Error_Code code, const char* message);
bool ModelIsLoaded() { return model_is_loaded_; }
bool ServerIsReady() { return server_is_ready_; }
TRITONSERVER_Error* DeleteInferRequest(
TRITONSERVER_InferenceRequest* irequest)
{
return request_delete_fn_(irequest);
}
static TritonLoader* GetSingleton();
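  // Each typedef below mirrors the signature of the TRITONSERVER_* C-API
  // function named in the comment that precedes it; LoadServerLibrary()
  // resolves the corresponding symbol from the shared library into a member
  // of that type.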
// TRITONSERVER_ApiVersion
typedef TRITONSERVER_Error* (*TritonServerApiVersionFn_t)(
uint32_t* major, uint32_t* minor);
// TRITONSERVER_ServerOptionsNew
typedef TRITONSERVER_Error* (*TritonServerOptionsNewFn_t)(
TRITONSERVER_ServerOptions** options);
// TRITONSERVER_ServerOptionsSetModelRepositoryPath
typedef TRITONSERVER_Error* (*TritonServerOptionSetModelRepoPathFn_t)(
TRITONSERVER_ServerOptions* options, const char* model_repository_path);
// TRITONSERVER_ServerOptionsSetLogVerbose
typedef TRITONSERVER_Error* (*TritonServerSetLogVerboseFn_t)(
TRITONSERVER_ServerOptions* options, int level);
// TRITONSERVER_ServerOptionsSetBackendDirectory
typedef TRITONSERVER_Error* (*TritonServerSetBackendDirFn_t)(
TRITONSERVER_ServerOptions* options, const char* backend_dir);
// TRITONSERVER_ServerOptionsSetRepoAgentDirectory
typedef TRITONSERVER_Error* (*TritonServerSetRepoAgentDirFn_t)(
TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
// TRITONSERVER_ServerOptionsSetStrictModelConfig
typedef TRITONSERVER_Error* (*TritonServerSetStrictModelConfigFn_t)(
TRITONSERVER_ServerOptions* options, bool strict);
// TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability
typedef TRITONSERVER_Error* (
*TritonServerSetMinSupportedComputeCapabilityFn_t)(
TRITONSERVER_ServerOptions* options, double cc);
// TRITONSERVER_ServerNew
typedef TRITONSERVER_Error* (*TritonServerNewFn_t)(
TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* option);
// TRITONSERVER_ServerOptionsDelete
typedef TRITONSERVER_Error* (*TritonServerOptionsDeleteFn_t)(
TRITONSERVER_ServerOptions* options);
// TRITONSERVER_ServerDelete
typedef TRITONSERVER_Error* (*TritonServerDeleteFn_t)(
TRITONSERVER_Server* server);
// TRITONSERVER_ServerIsLive
typedef TRITONSERVER_Error* (*TritonServerIsLiveFn_t)(
TRITONSERVER_Server* server, bool* live);
// TRITONSERVER_ServerIsReady
typedef TRITONSERVER_Error* (*TritonServerIsReadyFn_t)(
TRITONSERVER_Server* server, bool* ready);
// TRITONSERVER_ServerMetadata
typedef TRITONSERVER_Error* (*TritonServerMetadataFn_t)(
TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
// TRITONSERVER_MessageSerializeToJson
typedef TRITONSERVER_Error* (*TritonServerMessageSerializeToJsonFn_t)(
TRITONSERVER_Message* message, const char** base, size_t* byte_size);
// TRITONSERVER_MessageDelete
typedef TRITONSERVER_Error* (*TritonServerMessageDeleteFn_t)(
TRITONSERVER_Message* message);
// TRITONSERVER_ServerModelIsReady
typedef TRITONSERVER_Error* (*TritonServerModelIsReadyFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, bool* ready);
// TRITONSERVER_ServerModelMetadata
typedef TRITONSERVER_Error* (*TritonServerModelMetadataFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_metadata);
// TRITONSERVER_ResponseAllocatorNew
typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorNewFn_t)(
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
// TRITONSERVER_InferenceRequestNew
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestNewFn_t)(
TRITONSERVER_InferenceRequest** inference_request,
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
// TRITONSERVER_InferenceRequestSetId
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* id);
// TRITONSERVER_InferenceRequestSetReleaseCallback
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetReleaseCallbackFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
void* request_release_userp);
// TRITONSERVER_InferenceRequestAddInput
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestAddInputFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const TRITONSERVER_DataType datatype, const int64_t* shape,
uint64_t dim_count);
// TRITONSERVER_InferenceRequestAddRequestedOutput
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestAddRequestedOutputFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
// TRITONSERVER_InferenceRequestAppendInputData
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestAppendInputDataFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_i);
// TRITONSERVER_InferenceRequestSetResponseCallback
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetResponseCallbackFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_ResponseAllocator* response_allocator,
void* response_allocator_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp);
// TRITONSERVER_ServerInferAsync
typedef TRITONSERVER_Error* (*TritonServerInferAsyncFn_t)(
TRITONSERVER_Server* server,
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceTrace* trace);
// TRITONSERVER_InferenceResponseError
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseErrorFn_t)(
TRITONSERVER_InferenceResponse* inference_response);
// TRITONSERVER_InferenceResponseDelete
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseDeleteFn_t)(
TRITONSERVER_InferenceResponse* inference_response);
// TRITONSERVER_InferenceRequestRemoveAllInputData
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestRemoveAllInputDataFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
// TRITONSERVER_ResponseAllocatorDelete
typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorDeleteFn_t)(
TRITONSERVER_ResponseAllocator* allocator);
// TRITONSERVER_ErrorNew
typedef TRITONSERVER_Error* (*TritonServerErrorNewFn_t)(
TRITONSERVER_Error_Code code, const char* msg);
// TRITONSERVER_MemoryTypeString
typedef const char* (*TritonServerMemoryTypeStringFn_t)(
TRITONSERVER_MemoryType memtype);
// TRITONSERVER_InferenceResponseOutputCount
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseOutputCountFn_t)(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
// TRITONSERVER_DataTypeString
typedef const char* (*TritonServerDataTypeStringFn_t)(
TRITONSERVER_DataType datatype);
// TRITONSERVER_ErrorMessage
typedef const char* (*TritonServerErrorMessageFn_t)(
TRITONSERVER_Error* error);
// TRITONSERVER_ErrorDelete
typedef void (*TritonServerErrorDeleteFn_t)(TRITONSERVER_Error* error);
// TRITONSERVER_ErrorCodeString
typedef const char* (*TritonServerErrorCodeToStringFn_t)(
TRITONSERVER_Error* error);
// TRITONSERVER_ServerModelConfig
typedef TRITONSERVER_Error* (*TritonServerModelConfigFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, const uint32_t config_version,
TRITONSERVER_Message** model_config);
// TRITONSERVER_InferenceRequestSetCorrelationId
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetCorrelationIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
uint64_t correlation_id);
  // TRITONSERVER_InferenceRequestSetCorrelationIdString
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetStringCorrelationIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
const char* correlation_id);
// TRITONSERVER_InferenceRequestSetFlags
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetFlagsFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
// TRITONSERVER_InferenceRequestSetPriorityUInt64
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetPriorityFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint64_t priority);
// TRITONSERVER_InferenceRequestSetTimeoutMicroseconds
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
// TRITONSERVER_StringToDataType
typedef TRITONSERVER_DataType (*TritonServerStringToDatatypeFn_t)(
const char* dtype);
// TRITONSERVER_InferenceResponseOutput
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseOutputFn_t)(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
uint64_t* dim_count, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
void** userp);
// TRITONSERVER_InferenceRequestId
typedef TRITONSERVER_Error* (*TritonServerRequestIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char** id);
// TRITONSERVER_InferenceRequestDelete
typedef TRITONSERVER_Error* (*TritonServerRequestDeleteFn_t)(
TRITONSERVER_InferenceRequest* inference_request);
// TRITONSERVER_ServerModelStatistics
typedef TRITONSERVER_Error* (*TritonServerModelStatisticsFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_stats);
// TRITONSERVER_ServerUnloadModel
typedef TRITONSERVER_Error* (*TritonSeverUnloadModelFn_t)(
TRITONSERVER_Server* server, const char* model_name);
// TRITONSERVER_ServerOptionsSetLogInfo
typedef TRITONSERVER_Error* (*TritonSeverSetLogInfoFn_t)(
TRITONSERVER_ServerOptions* options, bool log);
// TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize
typedef TRITONSERVER_Error* (*TritonServerSetCudaMemoryPoolByteSizeFn_t)(
TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
private:
TritonLoader()
: InferenceServerClient(
false /* verbose flag that is set later during ::Create*/)
{
verbose_level_ = 0;
enforce_memory_type_ = false;
requested_memory_type_ = TRITONSERVER_MEMORY_CPU;
model_is_loaded_ = false;
server_is_ready_ = false;
shm_manager_ = std::make_unique<SharedMemoryManager>();
}
Error PopulateInternals(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose);
/// Load all tritonserver.h functions onto triton_loader
/// internal handles
Error LoadServerLibrary();
void ClearHandles();
/// Check if file exists in the current directory
/// \param filepath Path of library to check
/// \return perfanalyzer::clientbackend::Error
Error FileExists(std::string& filepath);
Error InitializeRequest(
const tc::InferOptions& options,
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_InferenceRequest** irequest);
Error AddInputs(
const std::vector<tc::InferInput*>& inputs,
TRITONSERVER_InferenceRequest* irequest);
Error AddOutputs(
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_InferenceRequest* irequest);
void* dlhandle_;
TritonServerApiVersionFn_t api_version_fn_;
TritonServerOptionsNewFn_t options_new_fn_;
TritonServerOptionSetModelRepoPathFn_t options_set_model_repo_path_fn_;
TritonServerSetLogVerboseFn_t set_log_verbose_fn_;
TritonServerSetBackendDirFn_t set_backend_directory_fn_;
TritonServerSetRepoAgentDirFn_t set_repo_agent_directory_fn_;
TritonServerSetStrictModelConfigFn_t set_strict_model_config_fn_;
TritonServerSetMinSupportedComputeCapabilityFn_t
set_min_supported_compute_capability_fn_;
TritonServerNewFn_t server_new_fn_;
TritonServerOptionsDeleteFn_t server_options_delete_fn_;
TritonServerDeleteFn_t server_delete_fn_;
TritonServerIsLiveFn_t server_is_live_fn_;
TritonServerIsReadyFn_t server_is_ready_fn_;
TritonServerMetadataFn_t server_metadata_fn_;
TritonServerMessageSerializeToJsonFn_t message_serialize_to_json_fn_;
TritonServerMessageDeleteFn_t message_delete_fn_;
TritonServerModelIsReadyFn_t model_is_ready_fn_;
TritonServerModelMetadataFn_t model_metadata_fn_;
TritonServerResponseAllocatorNewFn_t response_allocator_new_fn_;
TritonServerInferenceRequestNewFn_t inference_request_new_fn_;
TritonServerInferenceRequestSetIdFn_t inference_request_set_id_fn_;
TritonServerInferenceRequestSetReleaseCallbackFn_t
inference_request_set_release_callback_fn_;
TritonServerInferenceRequestAddInputFn_t inference_request_add_input_fn_;
TritonServerInferenceRequestAddRequestedOutputFn_t
inference_request_add_requested_output_fn_;
TritonServerInferenceRequestAppendInputDataFn_t
inference_request_append_input_data_fn_;
TritonServerInferenceRequestSetResponseCallbackFn_t
inference_request_set_response_callback_fn_;
TritonServerInferAsyncFn_t infer_async_fn_;
TritonServerInferenceResponseErrorFn_t inference_response_error_fn_;
TritonServerInferenceResponseDeleteFn_t inference_response_delete_fn_;
TritonServerResponseAllocatorDeleteFn_t response_allocator_delete_fn_;
TritonServerErrorNewFn_t error_new_fn_;
TritonServerMemoryTypeStringFn_t memory_type_string_fn_;
TritonServerInferenceResponseOutputCountFn_t
inference_response_output_count_fn_;
TritonServerDataTypeStringFn_t data_type_string_fn_;
TritonServerErrorMessageFn_t error_message_fn_;
TritonServerErrorDeleteFn_t error_delete_fn_;
TritonServerErrorCodeToStringFn_t error_code_to_string_fn_;
TritonServerModelConfigFn_t model_config_fn_;
TritonServerInferenceRequestSetCorrelationIdFn_t set_correlation_id_fn_;
TritonServerInferenceRequestSetStringCorrelationIdFn_t
set_string_correlation_id_fn_;
TritonServerInferenceRequestSetFlagsFn_t set_flags_fn_;
TritonServerInferenceRequestSetPriorityFn_t set_priority_fn_;
TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t set_timeout_ms_fn_;
TritonServerStringToDatatypeFn_t string_to_datatype_fn_;
TritonServerInferenceResponseOutputFn_t inference_response_output_fn_;
TritonServerRequestIdFn_t request_id_fn_;
TritonServerRequestDeleteFn_t request_delete_fn_;
TritonServerModelStatisticsFn_t model_statistics_fn_;
TritonSeverUnloadModelFn_t unload_model_fn_;
TritonSeverSetLogInfoFn_t set_log_info_fn_;
TritonServerSetCudaMemoryPoolByteSizeFn_t set_cuda_memory_pool_byte_size_;
std::shared_ptr<TRITONSERVER_Server> server_{nullptr};
std::string triton_server_path_{};
const std::string server_library_path_{"/lib/libtritonserver.so"};
int verbose_level_{0};
TRITONSERVER_MemoryType requested_memory_type_{TRITONSERVER_MEMORY_CPU};
bool enforce_memory_type_{false};
std::string model_repository_path_{""};
std::string model_name_{""};
int64_t model_version_{-1};
bool model_is_loaded_{false};
bool server_is_ready_{false};
std::unique_ptr<SharedMemoryManager> shm_manager_{nullptr};
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "command_line_parser.h"
#include <getopt.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include "perf_analyzer_exception.h"
namespace triton { namespace perfanalyzer {
PAParamsPtr
CLParser::Parse(int argc, char** argv)
{
ParseCommandLine(argc, argv);
VerifyOptions();
return params_;
}
// Used to format the usage message
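// Wraps `str` at roughly 60 characters per line by replacing the nearest
// preceding space with a newline plus indentation; `offset` accounts for the
// width of the option name printed before the message.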
std::string
CLParser::FormatMessage(std::string str, int offset) const
{
int width = 60;
int current_pos = offset;
while (current_pos + width < int(str.length())) {
int n = str.rfind(' ', current_pos + width);
if (n != int(std::string::npos)) {
str.replace(n, 1, "\n\t ");
current_pos += (width + 10);
}
}
return str;
}
void
CLParser::Usage(const std::string& msg)
{
if (!msg.empty()) {
std::cerr << "Error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv_[0] << " [options]" << std::endl;
std::cerr << "==== SYNOPSIS ====\n \n";
std::cerr << "\t--version " << std::endl;
std::cerr << "\t--service-kind "
"<\"triton\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">"
<< std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-x <model version>" << std::endl;
std::cerr << "\t--bls-composing-models=<string>" << std::endl;
std::cerr << "\t--model-signature-name <model signature name>" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << std::endl;
std::cerr << "I. MEASUREMENT PARAMETERS: " << std::endl;
std::cerr << "\t--async (-a)" << std::endl;
std::cerr << "\t--sync" << std::endl;
std::cerr << "\t--measurement-interval (-p) <measurement window (in msec)>"
<< std::endl;
std::cerr << "\t--concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--request-rate-range <start:end:step>" << std::endl;
std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">"
<< std::endl;
std::cerr << "\t--request-intervals <path to file containing time intervals "
"in microseconds>"
<< std::endl;
std::cerr << "\t--serial-sequences" << std::endl;
std::cerr << "\t--binary-search" << std::endl;
std::cerr << "\t--num-of-sequences <number of concurrent sequences>"
<< std::endl;
std::cerr << "\t--latency-threshold (-l) <latency threshold (in msec)>"
<< std::endl;
std::cerr << "\t--max-threads <thread counts>" << std::endl;
std::cerr << "\t--stability-percentage (-s) <deviation threshold for stable "
"measurement (in percentage)>"
<< std::endl;
std::cerr << "\t--max-trials (-r) <maximum number of measurements for each "
"profiling>"
<< std::endl;
std::cerr << "\t--percentile <percentile>" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
std::cerr << "\t-c <maximum concurrency>" << std::endl;
std::cerr << "\t-d" << std::endl;
std::cerr << std::endl;
std::cerr << "II. INPUT DATA OPTIONS: " << std::endl;
std::cerr << "\t-b <batch size>" << std::endl;
std::cerr << "\t--input-data <\"zero\"|\"random\"|<path>>" << std::endl;
std::cerr << "\t--shared-memory <\"system\"|\"cuda\"|\"none\">" << std::endl;
std::cerr << "\t--output-shared-memory-size <size in bytes>" << std::endl;
std::cerr << "\t--shape <name:shape>" << std::endl;
std::cerr << "\t--sequence-length <length>" << std::endl;
std::cerr << "\t--sequence-length-variation <variation>" << std::endl;
std::cerr << "\t--sequence-id-range <start:end>" << std::endl;
std::cerr << "\t--string-length <length>" << std::endl;
std::cerr << "\t--string-data <string>" << std::endl;
std::cerr << "\t--input-tensor-format=[binary|json]" << std::endl;
std::cerr << "\t--output-tensor-format=[binary|json]" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-z" << std::endl;
std::cerr << "\t--data-directory <path>" << std::endl;
std::cerr << std::endl;
std::cerr << "III. SERVER DETAILS: " << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-i <Protocol used to communicate with inference service>"
<< std::endl;
std::cerr << "\t--ssl-grpc-use-ssl <bool>" << std::endl;
std::cerr << "\t--ssl-grpc-root-certifications-file <path>" << std::endl;
std::cerr << "\t--ssl-grpc-private-key-file <path>" << std::endl;
std::cerr << "\t--ssl-grpc-certificate-chain-file <path>" << std::endl;
std::cerr << "\t--ssl-https-verify-peer <number>" << std::endl;
std::cerr << "\t--ssl-https-verify-host <number>" << std::endl;
std::cerr << "\t--ssl-https-ca-certificates-file <path>" << std::endl;
std::cerr << "\t--ssl-https-client-certificate-file <path>" << std::endl;
std::cerr << "\t--ssl-https-client-certificate-type <string>" << std::endl;
std::cerr << "\t--ssl-https-private-key-file <path>" << std::endl;
std::cerr << "\t--ssl-https-private-key-type <string>" << std::endl;
std::cerr << std::endl;
std::cerr << "IV. OTHER OPTIONS: " << std::endl;
std::cerr << "\t-f <filename for storing report in csv format>" << std::endl;
std::cerr << "\t--profile-export-file <path>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << "\t--streaming" << std::endl;
std::cerr << "\t--grpc-compression-algorithm <compression_algorithm>"
<< std::endl;
std::cerr << "\t--trace-file" << std::endl;
std::cerr << "\t--trace-level" << std::endl;
std::cerr << "\t--trace-rate" << std::endl;
std::cerr << "\t--trace-count" << std::endl;
std::cerr << "\t--log-frequency" << std::endl;
std::cerr << "\t--collect-metrics" << std::endl;
std::cerr << "\t--metrics-url" << std::endl;
std::cerr << "\t--metrics-interval" << std::endl;
std::cerr << std::endl;
std::cerr << "==== OPTIONS ==== \n \n";
std::cerr << FormatMessage(
" --version: print the current version of Perf Analyzer.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --service-kind: Describes the kind of service perf_analyzer to "
"generate load for. The options are \"triton\", \"triton_c_api\", "
"\"tfserving\" and \"torchserve\". Default value is \"triton\". "
"Note in order to use \"torchserve\" backend --input-data option "
"must point to a json file holding data in the following format "
"{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"<complete path to the "
"content file>\"]}, {...}...]}. The type of file here will depend "
"on the model. In order to use \"triton_c_api\" you must specify "
"the Triton server install path and the model repository path via "
"the --triton-server-directory and --model-repository flags",
18)
<< std::endl;
std::cerr
<< std::setw(9) << std::left << " -m: "
<< FormatMessage(
"This is a required argument and is used to specify the model"
" against which to run perf_analyzer.",
9)
<< std::endl;
std::cerr << std::setw(9) << std::left << " -x: "
<< FormatMessage(
"The version of the above model to be used. If not specified"
" the most recent version (that is, the highest numbered"
" version) of the model will be used.",
9)
<< std::endl;
std::cerr << FormatMessage(
" --model-signature-name: The signature name of the saved "
"model to use. Default value is \"serving_default\". This "
"option will be ignored if --service-kind is not "
"\"tfserving\".",
18)
<< std::endl;
std::cerr << std::setw(9) << std::left
<< " -v: " << FormatMessage("Enables verbose mode.", 9)
<< std::endl;
std::cerr << std::setw(9) << std::left
<< " -v -v: " << FormatMessage("Enables extra verbose mode.", 9)
<< std::endl;
std::cerr << std::endl;
std::cerr << "I. MEASUREMENT PARAMETERS: " << std::endl;
std::cerr
<< FormatMessage(
" --async (-a): Enables asynchronous mode in perf_analyzer. "
"By default, perf_analyzer will use synchronous API to "
"request inference. However, if the model is sequential "
"then default mode is asynchronous. Specify --sync to "
"operate sequential models in synchronous mode. In synchronous "
"mode, perf_analyzer will start threads equal to the concurrency "
"level. Use asynchronous mode to limit the number of threads, yet "
"maintain the concurrency.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sync: Force enables synchronous mode in perf_analyzer. "
"Can be used to operate perf_analyzer with sequential model "
"in synchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --measurement-interval (-p): Indicates the time interval used "
"for each measurement in milliseconds. The perf analyzer will "
"sample a time interval specified by -p and take measurement over "
"the requests completed within that time interval. The default "
"value is 5000 msec.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --measurement-mode <\"time_windows\"|\"count_windows\">: "
"Indicates the mode used for stabilizing measurements."
" \"time_windows\" will create windows such that the length "
"of each window is equal to --measurement-interval. "
"\"count_windows\" will create "
"windows such that there are at least "
"--measurement-request-count requests in each window.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --measurement-request-count: "
"Indicates the minimum number of requests to be collected in each "
"measurement window when \"count_windows\" mode is used. This "
"mode can "
"be enabled using the --measurement-mode flag.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --concurrency-range <start:end:step>: Determines the range of "
"concurrency levels covered by the perf_analyzer. The "
"perf_analyzer "
"will start from the concurrency level of 'start' and go till "
"'end' with a stride of 'step'. The default value of 'end' and "
"'step' are 1. If 'end' is not specified then perf_analyzer will "
"run for a single concurrency level determined by 'start'. If "
"'end' is set as 0, then the concurrency limit will be "
"incremented by 'step' till latency threshold is met. 'end' and "
"--latency-threshold can not be both 0 simultaneously. 'end' can "
"not be 0 for sequence models while using asynchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-rate-range <start:end:step>: Determines the range of "
"request rates for load generated by analyzer. This option can "
"take floating-point values. The search along the request rate "
"range is enabled only when using this option. If not specified, "
"then analyzer will search along the concurrency-range. The "
"perf_analyzer will start from the request rate of 'start' and go "
"till 'end' with a stride of 'step'. The default values of "
"'start', 'end' and 'step' are all 1.0. If 'end' is not specified "
"then perf_analyzer will run for a single request rate as "
"determined by 'start'. If 'end' is set as 0.0, then the request "
"rate will be incremented by 'step' till latency threshold is "
"met. 'end' and --latency-threshold can not be both 0 "
"simultaneously.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-distribution <\"poisson\"|\"constant\">: Specifies "
"the time interval distribution between dispatching inference "
"requests to the server. Poisson distribution closely mimics the "
"real-world work load on a server. This option is ignored if not "
"using --request-rate-range. By default, this option is set to be "
"constant.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-intervals: Specifies a path to a file containing time "
"intervals in microseconds. Each time interval should be in a new "
"line. The analyzer will try to maintain time intervals between "
"successive generated requests to be as close as possible in this "
"file. This option can be used to apply custom load to server "
"with a certain pattern of interest. The analyzer will loop "
"around the file if the duration of execution exceeds to that "
"accounted for by the intervals. This option can not be used with "
"--request-rate-range or --concurrency-range.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--binary-search: Enables the binary search on the specified "
"search range. This option requires 'start' and 'end' to be "
"expilicitly specified in the --concurrency-range or "
"--request-rate-range. When using this option, 'step' is more "
"like the precision. Lower the 'step', more the number of "
"iterations along the search path to find suitable convergence. "
"By default, linear search is used.",
18)
<< std::endl;
std::cerr << FormatMessage(
"--num-of-sequences: Sets the number of concurrent "
"sequences for sequence models. This option is ignored when "
"--request-rate-range is not specified. By default, its "
"value is 4.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --latency-threshold (-l): Sets the limit on the observed "
"latency. Analyzer will terminate the concurrency search once "
"the measured latency exceeds this threshold. By default, "
"latency threshold is set 0 and the perf_analyzer will run "
"for entire --concurrency-range.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --max-threads: Sets the maximum number of threads that will be "
"created for providing desired concurrency or request rate. "
"However, when running"
"in synchronous mode with concurrency-range having explicit 'end' "
"specification,"
"this value will be ignored. Default is 4 if --request-rate-range "
"is specified otherwise default is 16.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --stability-percentage (-s): Indicates the allowed variation in "
"latency measurements when determining if a result is stable. The "
"measurement is considered as stable if the ratio of max / min "
"from the recent 3 measurements is within (stability percentage)% "
"in terms of both infer per second and latency. Default is "
"10(%).",
18)
<< std::endl;
std::cerr << FormatMessage(
" --max-trials (-r): Indicates the maximum number of "
"measurements for each concurrency level visited during "
"search. The perf analyzer will take multiple measurements "
"and report the measurement until it is stable. The perf "
"analyzer will abort if the measurement is still unstable "
"after the maximum number of measurements. The default "
"value is 10.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --percentile: Indicates the confidence value as a percentile "
"that will be used to determine if a measurement is stable. For "
"example, a value of 85 indicates that the 85th percentile "
"latency will be used to determine stability. The percentile will "
"also be reported in the results. The default is -1 indicating "
"that the average latency is used to determine stability",
18)
<< std::endl;
std::cerr << FormatMessage(
" --serial-sequences: Enables serial sequence mode "
"where a maximum of one request is outstanding at a time "
"for any given sequence. The default is false.",
18)
<< std::endl;
std::cerr << std::endl;
std::cerr << "II. INPUT DATA OPTIONS: " << std::endl;
std::cerr << std::setw(9) << std::left
<< " -b: " << FormatMessage("Batch size for each request sent.", 9)
<< std::endl;
std::cerr
<< FormatMessage(
" --input-data: Select the type of data that will be used "
"for input in inference requests. The available options are "
"\"zero\", \"random\", path to a directory or a json file. If the "
"option is path to a directory then the directory must "
"contain a binary/text file for each non-string/string input "
"respectively, named the same as the input. Each "
"file must contain the data required for that input for a batch-1 "
"request. Each binary file should contain the raw binary "
"representation of the input in row-major order for non-string "
"inputs. The text file should contain all strings needed by "
"batch-1, each in a new line, listed in row-major order. When "
"pointing to a json file, user must adhere to the format "
"described in the Performance Analyzer documentation. By "
"specifying json data users can control data used with every "
"request. Multiple data streams can be specified for a sequence "
"model and the analyzer will select a data stream in a "
"round-robin fashion for every new sequence. Multiple json files "
"can also be provided (--input-data json_file1 --input-data "
"json-file2 and so on) and the analyzer will append data streams "
"from each file. When using --service-kind=torchserve make sure "
"this option points to a json file. Default is \"random\".",
18)
<< std::endl;
std::cerr << FormatMessage(
" --shared-memory <\"system\"|\"cuda\"|\"none\">: Specifies "
"the type of the shared memory to use for input and output "
"data. Default is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --output-shared-memory-size: The size in bytes of the shared "
"memory region to allocate per output tensor. Only needed when "
"one or more of the outputs are of string type and/or variable "
"shape. The value should be larger than the size of the largest "
"output tensor the model is expected to return. The analyzer will "
"use the following formula to calculate the total shared memory "
"to allocate: output_shared_memory_size * number_of_outputs * "
"batch_size. Defaults to 100KB.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --shape: The shape used for the specified input. The "
"argument must be specified as 'name:shape' where the shape "
"is a comma-separated list for dimension sizes, for example "
"'--shape input_name:1,2,3' indicates tensor shape [ 1, 2, 3 "
"]. --shape may be specified multiple times to specify "
"shapes for different inputs.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sequence-length: Indicates the base length of a "
"sequence used for sequence models. A sequence with length "
"X will be composed of X requests to be sent as the "
"elements in the sequence. The actual length of the sequence"
"will be within +/- Y% of the base length, where Y defaults "
"to 20% and is customizable via "
"`--sequence-length-variation`. If sequence length is "
"unspecified and input data is provided, the sequence "
"length will be the number of inputs in the user-provided "
"input data. Default is 20.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sequence-length-variation: The percentage variation in "
"length of sequences. This flag is only valid when "
"not using user-provided input data or when "
"`--sequence-length` is specified while using user-provided "
"input data. Default is 20.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --sequence-id-range <start:end>: Determines the range of "
"sequence id used by the perf_analyzer. The perf_analyzer "
"will start from the sequence id of 'start' and go till "
"'end' (excluded). If 'end' is not specified then perf_analyzer "
"will use new sequence id without bounds. If 'end' is specified "
"and the concurrency setting may result in maintaining a number "
"of sequences more than the range of available sequence id, "
"perf analyzer will exit with error due to possible sequence id "
"collision. The default setting is start from sequence id 1 and "
"without bounds",
18)
<< std::endl;
std::cerr << FormatMessage(
" --string-length: Specifies the length of the random "
"strings to be generated by the analyzer for string input. "
"This option is ignored if --input-data points to a "
"directory. Default is 128.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --string-data: If provided, analyzer will use this string "
"to initialize string input buffers. The perf analyzer will "
"replicate the given string to build tensors of required "
"shape. --string-length will not have any effect. This "
"option is ignored if --input-data points to a directory.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --input-tensor-format=[binary|json]: Specifies Triton "
"inference request input tensor format. Only valid when "
"HTTP protocol is used. Default is 'binary'.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --output-tensor-format=[binary|json]: Specifies Triton "
"inference response output tensor format. Only valid when "
"HTTP protocol is used. Default is 'binary'.",
18)
<< std::endl;
std::cerr << std::endl;
std::cerr << "III. SERVER DETAILS: " << std::endl;
std::cerr << std::setw(38) << std::left << " -u: "
<< FormatMessage(
"Specify URL to the server. When using triton default is "
"\"localhost:8000\" if using HTTP and \"localhost:8001\" "
"if using gRPC. When using tfserving default is "
"\"localhost:8500\". ",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " -i: "
<< FormatMessage(
"The communication protocol to use. The available protocols "
"are gRPC and HTTP. Default is HTTP.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-grpc-use-ssl: "
<< FormatMessage(
"Bool (true|false) for whether "
"to use encrypted channel to the server. Default false.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-grpc-root-certifications-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the server root certificates.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-grpc-private-key-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the client's private key.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-grpc-certificate-chain-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the client's certificate chain.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-verify-peer: "
<< FormatMessage(
"Number (0|1) to verify the "
"peer's SSL certificate. See "
"https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html for "
"the meaning of each value. Default is 1.",
38)
<< std::endl;
std::cerr
<< std::setw(38) << std::left << " --ssl-https-verify-host: "
<< FormatMessage(
"Number (0|1|2) to verify the "
"certificate's name against host. "
"See https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html for "
"the meaning of each value. Default is 2.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-ca-certificates-file: "
<< FormatMessage(
"Path to Certificate Authority "
"(CA) bundle.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-client-certificate-file: "
<< FormatMessage("Path to the SSL client certificate.", 38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-client-certificate-type: "
<< FormatMessage(
"Type (PEM|DER) of the client "
"SSL certificate. Default is PEM.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-private-key-file: "
<< FormatMessage(
"Path to the private keyfile "
"for TLS and SSL client cert.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-private-key-type: "
<< FormatMessage(
"Type (PEM|DER) of the private "
"key file. Default is PEM.",
38)
<< std::endl;
std::cerr << std::endl;
std::cerr << "IV. OTHER OPTIONS: " << std::endl;
std::cerr
<< std::setw(9) << std::left << " -f: "
<< FormatMessage(
"The latency report will be stored in the file named by "
"this option. By default, the result is not recorded in a file.",
9)
<< std::endl;
std::cerr << std::setw(9) << std::left << " --profile-export-file: "
<< FormatMessage(
"Specifies the path that the profile export will be "
"generated at. By default, the profile export will not be "
"generated.",
9)
<< std::endl;
std::cerr
<< std::setw(9) << std::left << " -H: "
<< FormatMessage(
"The header will be added to HTTP requests (ignored for GRPC "
"requests). The header must be specified as 'Header:Value'. -H "
"may be specified multiple times to add multiple headers.",
9)
<< std::endl;
std::cerr
<< FormatMessage(
" --streaming: Enables the use of streaming API. This flag is "
"only valid with gRPC protocol. By default, it is set false.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --grpc-compression-algorithm: The compression algorithm "
"to be used by gRPC when sending request. Only supported "
"when grpc protocol is being used. The supported values are "
"none, gzip, and deflate. Default value is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-file: Set the file where trace output will be saved."
" If --log-frequency is also specified, this argument "
"value will be the prefix of the files to save the trace "
"output. See --log-frequency for details. Only used for "
"service-kind of triton. Default value is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-level: Specify a trace level. OFF to disable tracing, "
"TIMESTAMPS to trace timestamps, TENSORS to trace tensors. It "
"may be specified multiple times to trace multiple "
"information. Default is OFF.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-rate: Set the trace sampling rate. Default is 1000.", 18)
<< std::endl;
std::cerr << FormatMessage(
" --trace-count: Set the number of traces to be sampled. "
"If the value is -1, the number of traces to be sampled "
"will not be limited. Default is -1.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --log-frequency: Set the trace log frequency. If the "
"value is 0, Triton will only log the trace output to "
"<trace-file> when shutting down. Otherwise, Triton will log "
"the trace output to <trace-file>.<idx> when it collects the "
"specified number of traces. For example, if the log frequency "
"is 100, when Triton collects the 100-th trace, it logs the "
"traces to file <trace-file>.0, and when it collects the 200-th "
"trace, it logs the 101-th to the 200-th traces to file "
"<trace-file>.1. Default is 0.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --triton-server-directory: The Triton server install "
"path. Required by and only used when C API "
"is used (--service-kind=triton_c_api). "
"eg:--triton-server-directory=/opt/tritonserver.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --model-repository: The model repository of which the model is "
"loaded. Required by and only used when C API is used "
"(--service-kind=triton_c_api). "
"eg:--model-repository=/tmp/host/docker-data/model_unit_test.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --verbose-csv: The csv files generated by perf analyzer "
"will include additional information.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --collect-metrics: Enables collection of server-side "
"inference server metrics. Outputs metrics in the csv file "
"generated with the -f option. Must enable `--verbose-csv` "
"option to use the `--collect-metrics`.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --metrics-url: The URL to query for server-side inference "
"server metrics. Default is 'localhost:8002/metrics'.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --metrics-interval: How often in milliseconds, within "
"each measurement window, to query for server-side "
"inference server metrics. Default is 1000.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --bls-composing-models: A comma separated list of all "
"BLS composing models (with optional model version number "
"after a colon for each) that may be called by the input "
"BLS model. For example, 'modelA:3,modelB' would specify "
"that modelA and modelB are composing models that may be "
"called by the input BLS model, and that modelA will use "
"version 3, while modelB's version is unspecified",
18)
<< std::endl;
exit(GENERIC_ERROR);
}
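// Illustrative example invocations exercising the options documented in
// Usage() above (the model name is hypothetical):
//   perf_analyzer -m my_model --concurrency-range 1:8:2
//   perf_analyzer -m my_model -i grpc --request-rate-range 100:500:100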
void
CLParser::PrintVersion()
{
std::cerr << "Perf Analyzer Version " << VERSION << " (commit " << SHA << ")"
<< std::endl;
exit(SUCCESS);
}
void
CLParser::ParseCommandLine(int argc, char** argv)
{
argc_ = argc;
argv_ = argv;
// {name, has_arg, *flag, val}
static struct option long_options[] = {
{"streaming", no_argument, 0, 0},
{"max-threads", required_argument, 0, 1},
{"sequence-length", required_argument, 0, 2},
{"percentile", required_argument, 0, 3},
{"data-directory", required_argument, 0, 4},
{"shape", required_argument, 0, 5},
{"measurement-interval", required_argument, 0, 6},
{"concurrency-range", required_argument, 0, 7},
{"latency-threshold", required_argument, 0, 8},
{"stability-percentage", required_argument, 0, 9},
{"max-trials", required_argument, 0, 10},
{"input-data", required_argument, 0, 11},
{"string-length", required_argument, 0, 12},
{"string-data", required_argument, 0, 13},
{"async", no_argument, 0, 14},
{"sync", no_argument, 0, 15},
{"request-rate-range", required_argument, 0, 16},
{"num-of-sequences", required_argument, 0, 17},
{"binary-search", no_argument, 0, 18},
{"request-distribution", required_argument, 0, 19},
{"request-intervals", required_argument, 0, 20},
{"shared-memory", required_argument, 0, 21},
{"output-shared-memory-size", required_argument, 0, 22},
{"service-kind", required_argument, 0, 23},
{"model-signature-name", required_argument, 0, 24},
{"grpc-compression-algorithm", required_argument, 0, 25},
{"measurement-mode", required_argument, 0, 26},
{"measurement-request-count", required_argument, 0, 27},
{"triton-server-directory", required_argument, 0, 28},
{"model-repository", required_argument, 0, 29},
{"sequence-id-range", required_argument, 0, 30},
{"ssl-grpc-use-ssl", no_argument, 0, 31},
{"ssl-grpc-root-certifications-file", required_argument, 0, 32},
{"ssl-grpc-private-key-file", required_argument, 0, 33},
{"ssl-grpc-certificate-chain-file", required_argument, 0, 34},
{"ssl-https-verify-peer", required_argument, 0, 35},
{"ssl-https-verify-host", required_argument, 0, 36},
{"ssl-https-ca-certificates-file", required_argument, 0, 37},
{"ssl-https-client-certificate-file", required_argument, 0, 38},
{"ssl-https-client-certificate-type", required_argument, 0, 39},
{"ssl-https-private-key-file", required_argument, 0, 40},
{"ssl-https-private-key-type", required_argument, 0, 41},
{"verbose-csv", no_argument, 0, 42},
{"enable-mpi", no_argument, 0, 43},
{"trace-file", required_argument, 0, 44},
{"trace-level", required_argument, 0, 45},
{"trace-rate", required_argument, 0, 46},
{"trace-count", required_argument, 0, 47},
{"log-frequency", required_argument, 0, 48},
{"collect-metrics", no_argument, 0, 49},
{"metrics-url", required_argument, 0, 50},
{"metrics-interval", required_argument, 0, 51},
{"sequence-length-variation", required_argument, 0, 52},
{"bls-composing-models", required_argument, 0, 53},
{"serial-sequences", no_argument, 0, 54},
{"input-tensor-format", required_argument, 0, 55},
{"output-tensor-format", required_argument, 0, 56},
{"version", no_argument, 0, 57},
{"profile-export-file", required_argument, 0, 58},
{0, 0, 0, 0}};
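// Note: the 'val' of each long option above doubles as both its case label
// in the switch below and its index into long_options (the error handler in
// the catch block relies on this), so keep the values sequential when adding
// new options.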
// Parse commandline...
int opt;
while ((opt = getopt_long(
argc, argv, "vdazc:u:m:x:b:t:p:i:H:l:r:s:f:", long_options,
NULL)) != -1) {
try {
switch (opt) {
case 0:
params_->streaming = true;
break;
case 1: {
std::string max_threads{optarg};
if (std::stoi(max_threads) > 0) {
params_->max_threads = std::stoull(max_threads);
params_->max_threads_specified = true;
} else {
Usage("Failed to parse --max-threads. The value must be > 0.");
}
break;
}
case 2: {
std::string sequence_length{optarg};
if (std::stoi(sequence_length) > 0) {
params_->sequence_length = std::stoull(sequence_length);
} else {
std::cerr << "WARNING: The sequence length must be > 0. Perf "
"Analyzer will use default value if it is measuring "
"on sequence model."
<< std::endl;
}
params_->sequence_length_specified = true;
break;
}
case 3:
params_->percentile = std::atoi(optarg);
break;
case 4:
params_->user_data.push_back(optarg);
break;
case 5: {
std::string arg = optarg;
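// Example (illustrative): "--shape INPUT0:1,2,3" yields name "INPUT0"
// and shape {1, 2, 3}.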
auto colon_pos = arg.rfind(":");
if (colon_pos == std::string::npos) {
Usage(
"Failed to parse --shape. There must be a colon after input "
"name.");
}
std::string name = arg.substr(0, colon_pos);
std::string shape_str = arg.substr(name.size() + 1);
size_t pos = 0;
std::vector<int64_t> shape;
while (pos != std::string::npos) {
size_t comma_pos = shape_str.find(",", pos);
int64_t dim;
if (comma_pos == std::string::npos) {
dim = std::stoll(shape_str.substr(pos, comma_pos));
pos = comma_pos;
} else {
dim = std::stoll(shape_str.substr(pos, comma_pos - pos));
pos = comma_pos + 1;
}
if (dim <= 0) {
Usage(
"Failed to parse --shape. The dimensions of input tensor "
"must be > 0.");
}
shape.emplace_back(dim);
}
params_->input_shapes[name] = shape;
break;
}
case 6:
case 'p': {
std::string measurement_window_ms{optarg};
if (std::stoi(measurement_window_ms) > 0) {
params_->measurement_window_ms = std::stoull(measurement_window_ms);
} else {
Usage(
"Failed to parse --measurement-interval (-p). The value must "
"be > 0 msec.");
}
break;
}
case 7: {
params_->using_concurrency_range = true;
std::string arg = optarg;
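// Example (illustrative): "--concurrency-range 2:16:2" yields
// start = 2, end = 16, step = 2.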
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 2) {
Usage(
"Failed to parse --concurrency-range. The value does not "
"match <start:end:step>.");
}
int64_t val;
if (colon_pos == std::string::npos) {
val = std::stoull(arg.substr(pos, colon_pos));
pos = colon_pos;
} else {
val = std::stoull(arg.substr(pos, colon_pos - pos));
pos = colon_pos + 1;
}
switch (index) {
case 0:
params_->concurrency_range.start = val;
break;
case 1:
params_->concurrency_range.end = val;
break;
case 2:
params_->concurrency_range.step = val;
break;
}
index++;
}
break;
}
case 8:
case 'l': {
std::string latency_threshold_ms{optarg};
if (std::stoi(latency_threshold_ms) == 0) {
params_->latency_threshold_ms = NO_LIMIT;
} else if (std::stoi(latency_threshold_ms) > 0) {
params_->latency_threshold_ms = std::stoull(latency_threshold_ms);
} else {
Usage(
"Failed to parse --latency-threshold (-l). The value must be "
">= 0 msecs.");
}
break;
}
case 9:
case 's': {
std::string stability_threshold{optarg};
if (std::stof(stability_threshold) >= 0.0) {
params_->stability_threshold = std::stof(optarg) / 100;
} else {
Usage(
"Failed to parse --stability-percentage (-s). The value must "
"be >= 0.0.");
}
break;
}
case 10:
case 'r': {
std::string max_trials{optarg};
if (std::stoi(max_trials) > 0) {
params_->max_trials = std::stoull(max_trials);
} else {
Usage("Failed to parse --max-trials (-r). The value must be > 0.");
}
break;
}
case 11: {
std::string arg = optarg;
// Check whether the argument is a directory
if (IsDirectory(arg) || IsFile(arg)) {
params_->user_data.push_back(optarg);
} else if (arg.compare("zero") == 0) {
params_->zero_input = true;
} else if (arg.compare("random") == 0) {
break;
} else {
Usage(
"Failed to parse --input-data. Unsupported type provided: '" +
std::string(optarg) +
"'. The available options are 'zero', 'random', path to a "
"directory, or a json file.");
}
break;
}
case 12: {
std::string string_length{optarg};
if (std::stoi(string_length) > 0) {
params_->string_length = std::stoull(string_length);
} else {
Usage("Failed to parse --string-length. The value must be > 0");
}
break;
}
case 13: {
params_->string_data = optarg;
break;
}
case 14:
case 'a': {
params_->async = true;
break;
}
case 15: {
params_->forced_sync = true;
break;
}
case 16: {
params_->using_request_rate_range = true;
std::string arg = optarg;
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 2) {
Usage(
"Failed to parse --request-rate-range. The value does not "
"match <start:end:step>.");
}
if (colon_pos == std::string::npos) {
params_->request_rate_range[index] =
std::stod(arg.substr(pos, colon_pos));
pos = colon_pos;
} else {
params_->request_rate_range[index] =
std::stod(arg.substr(pos, colon_pos - pos));
pos = colon_pos + 1;
index++;
}
}
break;
}
case 17: {
std::string num_of_sequences{optarg};
if (std::stoi(num_of_sequences) > 0) {
params_->num_of_sequences = std::stoul(num_of_sequences);
} else {
Usage("Failed to parse --num-of-sequences. The value must be > 0.");
}
break;
}
case 18: {
params_->search_mode = SearchMode::BINARY;
break;
}
case 19: {
std::string arg = optarg;
if (arg.compare("poisson") == 0) {
params_->request_distribution = Distribution::POISSON;
} else if (arg.compare("constant") == 0) {
params_->request_distribution = Distribution::CONSTANT;
} else {
Usage(
"Failed to parse --request-distribution. Unsupported type "
"provided: '" +
std::string(optarg) + "'. Choices are 'posson' or 'constant'.");
}
break;
}
case 20: {
std::string request_intervals_file{optarg};
if (IsFile(request_intervals_file)) {
params_->request_intervals_file = request_intervals_file;
params_->using_custom_intervals = true;
} else {
Usage(
"Failed to parse --request-intervals. The value must be a "
"valid file path");
}
break;
}
case 21: {
std::string arg = optarg;
if (arg.compare("system") == 0) {
params_->shared_memory_type =
SharedMemoryType::SYSTEM_SHARED_MEMORY;
} else if (arg.compare("cuda") == 0) {
#ifdef TRITON_ENABLE_GPU
params_->shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY;
#else
Usage(
"Cuda shared memory is not supported when "
"TRITON_ENABLE_GPU=0.");
#endif // TRITON_ENABLE_GPU
} else if (arg.compare("none") == 0) {
params_->shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY;
} else {
Usage(
"Failed to parse --shared-memory. Unsupported type provided: "
"'" +
std::string(optarg) +
"'. The available options are 'system', 'cuda', or 'none'.");
}
break;
}
case 22: {
std::string output_shm_size{optarg};
if (std::stoi(output_shm_size) >= 0) {
params_->output_shm_size = std::stoull(output_shm_size);
} else {
Usage(
"Failed to parse --output-shared-memory-size. The value must "
"be >= 0.");
}
break;
}
case 23: {
std::string arg = optarg;
if (arg.compare("triton") == 0) {
params_->kind = cb::TRITON;
} else if (arg.compare("tfserving") == 0) {
params_->kind = cb::TENSORFLOW_SERVING;
} else if (arg.compare("torchserve") == 0) {
params_->kind = cb::TORCHSERVE;
} else if (arg.compare("triton_c_api") == 0) {
params_->kind = cb::TRITON_C_API;
} else {
Usage(
"Failed to parse --service-kind. Unsupported type provided: '" +
std::string{optarg} +
"'. The available options are 'triton', 'tfserving', "
"'torchserve', or 'triton_c_api'.");
}
break;
}
case 24:
params_->model_signature_name = optarg;
break;
case 25: {
std::string arg = optarg;
if (arg.compare("none") == 0) {
params_->compression_algorithm = cb::COMPRESS_NONE;
} else if (arg.compare("deflate") == 0) {
params_->compression_algorithm = cb::COMPRESS_DEFLATE;
} else if (arg.compare("gzip") == 0) {
params_->compression_algorithm = cb::COMPRESS_GZIP;
} else {
Usage(
"Failed to parse --grpc-compression-algorithm. Unsupported "
"type provided: '" +
arg +
"'. The available options are 'gzip', 'deflate', or 'none'.");
}
params_->using_grpc_compression = true;
break;
}
case 26: {
std::string arg = optarg;
if (arg.compare("time_windows") == 0) {
params_->measurement_mode = MeasurementMode::TIME_WINDOWS;
} else if (arg.compare("count_windows") == 0) {
params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
} else {
Usage(
"Failed to parse --measurement-mode. Unsupported type "
"provided: '" +
arg +
"'. The available options are 'time_windows' or "
"'count_windows'.");
}
break;
}
case 27: {
std::string request_count{optarg};
if (std::stoi(request_count) > 0) {
params_->measurement_request_count = std::stoull(request_count);
} else {
Usage(
"Failed to parse --measurement-request-count. The value must "
"be > 0.");
}
break;
}
case 28: {
params_->triton_server_path = optarg;
break;
}
case 29: {
params_->model_repository_path = optarg;
break;
}
case 30: {
std::string arg = optarg;
int64_t start_id = 0;
int64_t end_id = 0;
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 1) {
Usage(
"Failed to parse --sequence-id-range. The value does not "
"match <start:end>.");
}
if (colon_pos == std::string::npos) {
std::string sequence_id{arg.substr(pos, colon_pos)};
if (index == 0) {
start_id = std::stoi(sequence_id);
} else {
end_id = std::stoi(sequence_id);
}
pos = colon_pos;
} else {
std::string sequence_id{arg.substr(pos, colon_pos - pos)};
start_id = std::stoi(sequence_id);
pos = colon_pos + 1;
index++;
}
}
// Check for invalid inputs. Only validate 'end' when it was provided.
if (start_id < 0 || (index > 0 && end_id < 0)) {
Usage(
"Failed to parse --sequence-id-range. The range values must be "
">= 0.");
} else if (index > 0 && start_id > end_id) {
Usage(
"Failed to parse --sequence-id-range. The 'end' value must be "
"greater than or equal to the 'start' value.");
}
if (index == 0) { // Only start ID is given
params_->start_sequence_id = start_id;
} else {
params_->start_sequence_id = start_id;
params_->sequence_id_range = end_id - start_id;
}
break;
}
case 31: {
params_->ssl_options.ssl_grpc_use_ssl = true;
break;
}
case 32: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_root_certifications_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-root-certifications-file. The "
"value must be a valid file path.");
}
break;
}
case 33: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_private_key_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-private-key-file. The value must "
"be a valid file path.");
}
break;
}
case 34: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_certificate_chain_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-certificate-chain-file. The value "
"must be a valid file path.");
}
break;
}
case 35: {
if (std::atol(optarg) == 0 || std::atol(optarg) == 1) {
params_->ssl_options.ssl_https_verify_peer = std::atol(optarg);
} else {
Usage(
"Failed to parse --ssl-https-verify-peer. The value must be "
"either 0 or 1.");
}
break;
}
case 36: {
if (std::atol(optarg) == 0 || std::atol(optarg) == 1 ||
std::atol(optarg) == 2) {
params_->ssl_options.ssl_https_verify_host = std::atol(optarg);
} else {
Usage(
"Failed to parse --ssl-https-verify-host. The value must be "
"either 0, 1, or 2.");
}
break;
}
case 37: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_ca_certificates_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-ca-certificates-file. The value "
"must be a valid file path.");
}
break;
}
case 38: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_client_certificate_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-client-certificate-file. The "
"value must be a valid file path.");
}
break;
}
case 39: {
if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") {
params_->ssl_options.ssl_https_client_certificate_type = optarg;
} else {
Usage(
"Failed to parse --ssl-https-client-certificate-type. "
"Unsupported type provided: '" +
std::string{optarg} +
"'. The available options are 'PEM' or 'DER'.");
}
break;
}
case 40: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_private_key_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-private-key-file. The value must "
"be a valid file path.");
}
break;
}
case 41: {
if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") {
params_->ssl_options.ssl_https_private_key_type = optarg;
} else {
Usage(
"Failed to parse --ssl-https-private-key-type. Unsupported "
"type provided: '" +
std::string{optarg} +
"'. The available options are 'PEM' or 'DER'.");
}
break;
}
case 42: {
params_->verbose_csv = true;
break;
}
case 43: {
params_->enable_mpi = true;
break;
}
case 44: {
params_->trace_options["trace_file"] = {optarg};
break;
}
case 45: {
std::string trace_level{optarg};
if (trace_level == "OFF" || trace_level == "TIMESTAMPS" ||
trace_level == "TENSORS") {
params_->trace_options["trace_level"] = {trace_level};
} else {
Usage(
"Failed to parse --trace-level. Unsupported type provided: '" +
trace_level +
"'. The available options are 'OFF', 'TIMESTAMPS', or "
"'TENSORS'.");
}
break;
}
case 46: {
params_->trace_options["trace_rate"] = {optarg};
break;
}
case 47: {
std::string trace_count{optarg};
if (std::stoi(trace_count) >= -1) {
params_->trace_options["trace_count"] = {trace_count};
} else {
Usage(
"Failed to parse --trace-count. The value must be >= 0 or set "
"to -1 (default).");
}
break;
}
case 48: {
std::string log_frequency{optarg};
if (std::stoi(log_frequency) >= 0) {
params_->trace_options["log_frequency"] = {log_frequency};
} else {
Usage("Failed to parse --log-frequency. The value must be >= 0.");
}
break;
}
case 49: {
params_->should_collect_metrics = true;
break;
}
case 50: {
params_->metrics_url = optarg;
params_->metrics_url_specified = true;
break;
}
case 51: {
std::string metrics_interval_ms{optarg};
if (std::stoi(metrics_interval_ms) > 0) {
params_->metrics_interval_ms = std::stoull(metrics_interval_ms);
params_->metrics_interval_ms_specified = true;
} else {
Usage(
"Failed to parse --metrics-interval. The value must be > 0 "
"msecs.");
}
break;
}
case 52: {
params_->sequence_length_variation = std::stod(optarg);
break;
}
case 53: {
std::string arg = optarg;
// Remove all spaces in the string
arg.erase(
std::remove_if(arg.begin(), arg.end(), ::isspace), arg.end());
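// Example (illustrative): "modelA:3,modelB" yields {"modelA", "3"} and
// {"modelB", ""}.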
std::stringstream ss(arg);
while (ss.good()) {
std::string model_name;
std::string model_version{""};
std::string tmp_model_name;
getline(ss, tmp_model_name, ',');
size_t colon_pos = tmp_model_name.find(":");
if (colon_pos == std::string::npos) {
model_name = tmp_model_name;
} else {
model_name = tmp_model_name.substr(0, colon_pos);
model_version = tmp_model_name.substr(colon_pos + 1);
}
params_->bls_composing_models.push_back(
{model_name, model_version});
}
break;
}
case 54: {
params_->serial_sequences = true;
break;
}
case 55: {
cb::TensorFormat input_tensor_format{ParseTensorFormat(optarg)};
if (input_tensor_format == cb::TensorFormat::UNKNOWN) {
Usage(
"Failed to parse --input-tensor-format. Unsupported type "
"provided: '" +
std::string{optarg} +
"'. The available options are 'binary' or 'json'.");
}
params_->input_tensor_format = input_tensor_format;
break;
}
case 56: {
cb::TensorFormat output_tensor_format{ParseTensorFormat(optarg)};
if (output_tensor_format == cb::TensorFormat::UNKNOWN) {
Usage(
"Failed to parse --output-tensor-format. Unsupported type "
"provided: '" +
std::string{optarg} +
"'. The available options are 'binary' or 'json'.");
}
params_->output_tensor_format = output_tensor_format;
break;
}
case 57: {
PrintVersion();
break;
}
case 58: {
std::string profile_export_file{optarg};
if (IsFile(profile_export_file) || IsDirectory(profile_export_file)) {
Usage(
"Failed to parse --profile-export-file. Path must not already "
"exist.");
}
params_->profile_export_file = profile_export_file;
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
break;
case 'z':
params_->zero_input = true;
break;
case 'd':
params_->using_old_options = true;
params_->dynamic_concurrency_mode = true;
break;
case 'u':
params_->url_specified = true;
params_->url = optarg;
break;
case 'm':
params_->model_name = optarg;
break;
case 'x':
params_->model_version = optarg;
break;
case 'b': {
std::string batch_size{optarg};
if (std::stoi(batch_size) > 0) {
params_->batch_size = std::stoull(batch_size);
params_->using_batch_size = true;
} else {
Usage("Failed to parse -b (batch size). The value must be > 0.");
}
break;
}
case 't':
params_->using_old_options = true;
params_->concurrent_request_count = std::atoi(optarg);
break;
case 'i':
params_->protocol = ParseProtocol(optarg);
break;
case 'H': {
std::string arg = optarg;
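// The header name is everything before the first ':' and the value is
// everything after it, so values may themselves contain ':'.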
std::string header = arg.substr(0, arg.find(":"));
(*params_->http_headers)[header] = arg.substr(header.size() + 1);
break;
}
case 'c':
params_->using_old_options = true;
params_->max_concurrency = std::atoi(optarg);
break;
case 'f':
params_->filename = optarg;
break;
case '?':
Usage();
break;
}
}
catch (const std::invalid_argument& ia) {
if (opt >= 'A') { // short options
Usage(
"Failed to parse -" + std::string{(char)opt} +
". Invalid value provided: " + std::string{optarg});
} else {
Usage(
"Failed to parse --" + std::string{long_options[opt].name} +
". Invalid value provided: " + std::string{optarg});
}
}
}
params_->mpi_driver = std::shared_ptr<triton::perfanalyzer::MPIDriver>{
std::make_shared<triton::perfanalyzer::MPIDriver>(params_->enable_mpi)};
params_->mpi_driver->MPIInit(&argc, &argv);
if (!params_->url_specified &&
(params_->protocol == cb::ProtocolType::GRPC)) {
if (params_->kind == cb::BackendKind::TRITON) {
params_->url = "localhost:8001";
} else if (params_->kind == cb::BackendKind::TENSORFLOW_SERVING) {
params_->url = "localhost:8500";
}
}
// Overriding the max_threads default for request_rate search
if (!params_->max_threads_specified && params_->targeting_concurrency()) {
params_->max_threads = 16;
}
if (params_->using_custom_intervals) {
// Will be using user-provided time intervals, hence no control variable.
params_->search_mode = SearchMode::NONE;
}
}
void
CLParser::VerifyOptions()
{
if (params_->model_name.empty()) {
Usage("Failed to parse -m (model name). The value must be specified.");
}
if (params_->concurrency_range.start <= 0 ||
params_->concurrent_request_count < 0) {
Usage("The start of the search range must be > 0");
}
if (params_->request_rate_range[SEARCH_RANGE::kSTART] <= 0) {
Usage(
"Failed to parse --request-rate-range. The start of the search range "
"must be > 0.");
}
if (params_->protocol == cb::ProtocolType::UNKNOWN) {
Usage(
"Failed to parse -i (protocol). The value should be either HTTP or "
"gRPC.");
}
if (params_->streaming && (params_->protocol != cb::ProtocolType::GRPC)) {
Usage("Streaming is only allowed with gRPC protocol.");
}
if (params_->using_grpc_compression &&
(params_->protocol != cb::ProtocolType::GRPC)) {
Usage("Using compression algorithm is only allowed with gRPC protocol.");
}
if (params_->sequence_length_variation < 0.0) {
Usage(
"Failed to parse --sequence-length-variation. The value must be >= "
"0.0.");
}
if (params_->start_sequence_id == 0) {
params_->start_sequence_id = 1;
std::cerr << "WARNING: using an invalid start sequence id. Perf Analyzer"
<< " will use default value if it is measuring on sequence model."
<< std::endl;
}
if (params_->percentile != -1 &&
(params_->percentile > 99 || params_->percentile < 1)) {
Usage(
"Failed to parse --percentile. The value must be -1 for not reporting "
"or in range (0, 100).");
}
if (params_->zero_input && !params_->user_data.empty()) {
Usage("The -z flag cannot be set when --data-directory is provided.");
}
if (params_->async && params_->forced_sync) {
Usage("Cannot specify --async and --sync simultaneously.");
}
if (params_->using_concurrency_range && params_->using_old_options) {
Usage("Cannot use deprecated options with --concurrency-range.");
} else if (params_->using_old_options) {
if (params_->dynamic_concurrency_mode) {
params_->concurrency_range.end = params_->max_concurrency;
}
params_->concurrency_range.start = params_->concurrent_request_count;
}
if (params_->using_request_rate_range && params_->using_old_options) {
Usage("Cannot use concurrency options with --request-rate-range.");
}
if (params_->using_request_rate_range && params_->using_concurrency_range) {
Usage(
"Cannot specify --concurrency-range and --request-rate-range "
"simultaneously.");
}
if (params_->using_request_rate_range && params_->mpi_driver->IsMPIRun() &&
(params_->request_rate_range[SEARCH_RANGE::kEND] != 1.0 ||
params_->request_rate_range[SEARCH_RANGE::kSTEP] != 1.0)) {
Usage("Cannot specify --request-rate-range when in multi-model mode.");
}
if (params_->using_custom_intervals && params_->using_old_options) {
Usage("Cannot use deprecated options with --request-intervals.");
}
if ((params_->using_custom_intervals) &&
(params_->using_request_rate_range || params_->using_concurrency_range)) {
Usage(
"Cannot use --concurrency-range or --request-rate-range "
"along with --request-intervals.");
}
if (params_->using_concurrency_range && params_->mpi_driver->IsMPIRun() &&
(params_->concurrency_range.end != 1 ||
params_->concurrency_range.step != 1)) {
Usage("Cannot specify --concurrency-range when in multi-model mode.");
}
if (((params_->concurrency_range.end == NO_LIMIT) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] ==
static_cast<double>(NO_LIMIT))) &&
(params_->latency_threshold_ms == NO_LIMIT)) {
Usage(
"The end of the search range and the latency limit can not be both 0 "
"(or 0.0) simultaneously");
}
if (((params_->concurrency_range.end == NO_LIMIT) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] ==
static_cast<double>(NO_LIMIT))) &&
(params_->search_mode == SearchMode::BINARY)) {
Usage("The end of the range can not be 0 (or 0.0) for binary search mode.");
}
if ((params_->search_mode == SearchMode::BINARY) &&
(params_->latency_threshold_ms == NO_LIMIT)) {
Usage("The --latency-threshold cannot be 0 for binary search mode.");
}
if (((params_->concurrency_range.end < params_->concurrency_range.start) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] <
params_->request_rate_range[SEARCH_RANGE::kSTART])) &&
(params_->search_mode == SearchMode::BINARY)) {
Usage(
"The end of the range can not be less than start of the range for "
"binary search mode.");
}
if (params_->kind == cb::TENSORFLOW_SERVING) {
if (params_->protocol != cb::ProtocolType::GRPC) {
Usage(
"perf_analyzer supports only grpc protocol for TensorFlow Serving.");
} else if (params_->streaming) {
Usage("perf_analyzer does not support streaming for TensorFlow Serving.");
} else if (params_->async) {
Usage("perf_analyzer does not support async API for TensorFlow Serving.");
} else if (!params_->using_batch_size) {
params_->batch_size = 0;
}
} else if (params_->kind == cb::TORCHSERVE) {
if (params_->user_data.empty()) {
Usage(
"--input-data should be provided with a json file with "
"input data for torchserve.");
}
}
if (params_->kind == cb::BackendKind::TRITON_C_API) {
if (params_->triton_server_path.empty()) {
Usage(
"--triton-server-path should not be empty when using "
"service-kind=triton_c_api.");
}
if (params_->model_repository_path.empty()) {
Usage(
"--model-repository should not be empty when using "
"service-kind=triton_c_api.");
}
if (params_->async) {
Usage(
"Async mode is not supported by triton_c_api service "
"kind.");
}
params_->protocol = cb::ProtocolType::UNKNOWN;
}
if (params_->should_collect_metrics &&
params_->kind != cb::BackendKind::TRITON) {
Usage(
"Server-side metric collection is only supported with Triton client "
"backend.");
}
if (params_->metrics_url_specified &&
params_->should_collect_metrics == false) {
Usage(
"Must specify --collect-metrics when using the --metrics-url option.");
}
if (params_->metrics_interval_ms_specified &&
params_->should_collect_metrics == false) {
Usage(
"Must specify --collect-metrics when using the --metrics-interval "
"option.");
}
if (params_->should_collect_metrics && !params_->metrics_url_specified) {
// Update the default metrics URL to be associated with the input URL
// instead of localhost
//
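// For example (illustrative), "-u remotehost:8000" yields a metrics URL
// of "remotehost:8002/metrics".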
size_t colon_pos = params_->url.find(':');
if (colon_pos != std::string::npos) {
params_->metrics_url =
params_->url.substr(0, colon_pos) + ":8002/metrics";
}
}
}
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "constants.h"
#include "mpi_utils.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
enum SEARCH_RANGE { kSTART = 0, kEND = 1, kSTEP = 2 };
// Perf Analyzer command line parameters.
// PAParams are used to initialize PerfAnalyzer and track configuration
//
struct PerfAnalyzerParameters {
bool verbose = false;
bool extra_verbose = false;
bool streaming = false;
size_t max_threads = 4;
bool max_threads_specified = false;
size_t sequence_length = 20; // average length of a sequence
bool sequence_length_specified = false;
double sequence_length_variation = 20.0;
int32_t percentile = -1;
std::vector<std::string> user_data;
std::unordered_map<std::string, std::vector<int64_t>> input_shapes;
std::vector<cb::ModelIdentifier> bls_composing_models;
uint64_t measurement_window_ms = 5000;
bool using_concurrency_range = false;
Range<uint64_t> concurrency_range{1, 1, 1};
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
bool zero_input = false;
size_t string_length = 128;
std::string string_data;
bool async = false;
bool forced_sync = false;
bool using_request_rate_range = false;
double request_rate_range[3] = {1.0, 1.0, 1.0};
uint32_t num_of_sequences = 4;
bool serial_sequences = false;
SearchMode search_mode = SearchMode::LINEAR;
Distribution request_distribution = Distribution::CONSTANT;
bool using_custom_intervals = false;
std::string request_intervals_file{""};
SharedMemoryType shared_memory_type = NO_SHARED_MEMORY;
size_t output_shm_size = 100 * 1024;
clientbackend::BackendKind kind = clientbackend::BackendKind::TRITON;
std::string model_signature_name{"serving_default"};
bool using_grpc_compression = false;
clientbackend::GrpcCompressionAlgorithm compression_algorithm =
clientbackend::GrpcCompressionAlgorithm::COMPRESS_NONE;
MeasurementMode measurement_mode = MeasurementMode::TIME_WINDOWS;
uint64_t measurement_request_count = 50;
std::string triton_server_path = "/opt/tritonserver";
std::string model_repository_path;
uint64_t start_sequence_id = 1;
uint64_t sequence_id_range = UINT32_MAX;
clientbackend::SslOptionsBase ssl_options; // gRPC and HTTP SSL options
// Verbose csv option for including additional information
bool verbose_csv = false;
// Enable MPI option for using MPI functionality with multi-model mode.
bool enable_mpi = false;
std::map<std::string, std::vector<std::string>> trace_options;
bool using_old_options = false;
bool dynamic_concurrency_mode = false;
bool url_specified = false;
std::string url{"localhost:8000"};
std::string model_name;
std::string model_version;
uint64_t batch_size = 1;
bool using_batch_size = false;
int32_t concurrent_request_count = 1;
clientbackend::ProtocolType protocol = clientbackend::ProtocolType::HTTP;
std::shared_ptr<clientbackend::Headers> http_headers{
new clientbackend::Headers()};
size_t max_concurrency = 0;
std::string filename{""};
std::shared_ptr<MPIDriver> mpi_driver;
std::string memory_type{"system"}; // currently not used, to be removed
// Enable collection of server-side metrics from inference server.
bool should_collect_metrics{false};
// The URL to query for server-side inference server metrics.
std::string metrics_url{"localhost:8002/metrics"};
bool metrics_url_specified{false};
// How often, within each measurement window, to query for server-side
// inference server metrics.
uint64_t metrics_interval_ms{1000};
bool metrics_interval_ms_specified{false};
// Return true if targeting concurrency
//
bool targeting_concurrency() const
{
return (
using_concurrency_range || using_old_options ||
!(using_request_rate_range || using_custom_intervals));
}
// Sets the threshold for PA client overhead.
// Overhead is defined as the percentage of time when PA is doing work and
// requests are not outstanding to the triton server. If the overhead
// percentage exceeds the threshold, a warning is displayed.
//
double overhead_pct_threshold{50.0};
// Triton inference request input tensor format.
cb::TensorFormat input_tensor_format{cb::TensorFormat::BINARY};
// Triton inference response output tensor format.
cb::TensorFormat output_tensor_format{cb::TensorFormat::BINARY};
// The profile export file path.
std::string profile_export_file{""};
};
using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
class CLParser {
public:
CLParser() : params_(new PerfAnalyzerParameters{}) {}
// Parse command line arguments into a parameters struct
//
PAParamsPtr Parse(int argc, char** argv);
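// Illustrative usage (sketch): in main(),
//   PAParamsPtr params = CLParser().Parse(argc, argv);
// parses the command line and returns the populated parameter struct.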
private:
char** argv_;
int argc_;
PAParamsPtr params_;
std::string FormatMessage(std::string str, int offset) const;
virtual void Usage(const std::string& msg = std::string());
void PrintVersion();
void ParseCommandLine(int argc, char** argv);
void VerifyOptions();
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "base_queue_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Context ID Tracker that always returns context 0, but ensures that only X
// requests are outstanding at a time
//
class ConcurrencyCtxIdTracker : public BaseQueueCtxIdTracker {
public:
ConcurrencyCtxIdTracker() = default;
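// Refill the free context id queue with 'count' entries, all for context 0,
// so that at most 'count' requests can be outstanding at once.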
void Reset(size_t count) override
{
Clear();
for (size_t i = 0; i < count; ++i) {
free_ctx_ids_.push(0);
}
}
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "concurrency_manager.h"
#include <queue>
namespace triton { namespace perfanalyzer {
ConcurrencyManager::~ConcurrencyManager()
{
// The destruction of the derived class should wait for all the request
// generator threads to finish
StopWorkerThreads();
}
cb::Error
ConcurrencyManager::Create(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager)
{
std::unique_ptr<ConcurrencyManager> local_manager(new ConcurrencyManager(
async, streaming, batch_size, max_threads, max_concurrency,
shared_memory_type, output_shm_size, parser, factory));
*manager = std::move(local_manager);
return cb::Error::Success;
}
ConcurrencyManager::ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: LoadManager(
async, streaming, batch_size, max_threads, shared_memory_type,
output_shm_size, parser, factory),
execute_(true), max_concurrency_(max_concurrency)
{
threads_config_.reserve(max_threads);
}
void
ConcurrencyManager::InitManagerFinalize()
{
if (on_sequence_model_) {
sequence_manager_->InitSequenceStatuses(max_concurrency_);
}
}
cb::Error
ConcurrencyManager::ChangeConcurrencyLevel(
const size_t concurrent_request_count)
{
PauseSequenceWorkers();
ReconfigThreads(concurrent_request_count);
ResumeSequenceWorkers();
std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
return cb::Error::Success;
}
void
ConcurrencyManager::PauseSequenceWorkers()
{
if (on_sequence_model_) {
execute_ = false;
// Wait to see all threads are paused.
for (auto& thread_config : threads_config_) {
while (!thread_config->is_paused_) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
}
}
void
ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
{
// Always prefer to create new threads if the maximum limit has not been met
//
// While operating in synchronous mode, each context can send only one
// request at a time, hence the number of worker threads should be equal to
// the requested concurrency levels.
//
while ((concurrent_request_count > threads_.size()) &&
(threads_.size() < max_threads_)) {
// Launch new thread for inferencing
threads_stat_.emplace_back(new ThreadStat());
threads_config_.emplace_back(
new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
workers_.push_back(
MakeWorker(threads_stat_.back(), threads_config_.back()));
threads_.emplace_back(&IWorker::Infer, workers_.back());
}
{
// Make sure all threads are reconfigured before they are woken up
std::lock_guard<std::mutex> lock(wake_mutex_);
// Compute the new concurrency level for each thread (take floor)
// and spread the remaining value
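// (e.g. 10 concurrent requests over 4 threads -> concurrencies 3, 3, 2, 2)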
size_t avg_concurrency = concurrent_request_count / threads_.size();
size_t threads_add_one = concurrent_request_count % threads_.size();
size_t seq_stat_index_offset = 0;
active_threads_ = 0;
for (size_t i = 0; i < threads_stat_.size(); i++) {
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);
threads_config_[i]->concurrency_ = concurrency;
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;
seq_stat_index_offset += concurrency;
if (concurrency) {
active_threads_++;
}
}
// TODO REFACTOR TMA-1043 the memory manager should have API to set
// num_active_threads in constructor, as well as overwrite it here
}
}
void
ConcurrencyManager::ResumeSequenceWorkers()
{
if (on_sequence_model_) {
execute_ = true;
}
// Make sure all threads will check their updated concurrency level
wake_signal_.notify_all();
}
std::shared_ptr<IWorker>
ConcurrencyManager::MakeWorker(
std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
{
uint32_t id = workers_.size();
return std::make_shared<ConcurrencyWorker>(
id, thread_stat, thread_config, parser_, data_loader_, factory_,
on_sequence_model_, async_, max_concurrency_, using_json_data_,
streaming_, batch_size_, wake_signal_, wake_mutex_, active_threads_,
execute_, infer_data_manager_, sequence_manager_);
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "concurrency_worker.h"
#include "load_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestConcurrencyManager;
#endif
//==============================================================================
/// ConcurrencyManager is a helper class that sends inference requests to the
/// inference server consistently, based on the specified settings, so that
/// perf_analyzer can measure performance under different concurrency levels.
///
/// An instance of the concurrency manager is created at the start of
/// perf_analyzer and is used to simulate different load levels with respect
/// to the number of concurrent infer requests and to collect per-request
/// statistics.
///
/// Detail:
/// Concurrency Manager will maintain the number of concurrent requests by
/// spawning worker threads that keep sending randomly generated requests to the
/// server. The worker threads will record the start time and end
/// time of each request into a shared vector.
///
class ConcurrencyManager : public LoadManager {
public:
~ConcurrencyManager();
  /// Create a concurrency manager that is responsible for maintaining the
  /// specified load on the inference server.
/// \param async Whether to use asynchronous or synchronous API for infer
/// request.
/// \param streaming Whether to use gRPC streaming API for infer request
/// \param batch_size The batch size used for each request.
/// \param max_threads The maximum number of working threads to be spawned.
/// \param max_concurrency The maximum concurrency which will be requested.
/// \param shared_memory_type The type of shared memory to use for inputs.
/// \param output_shm_size The size in bytes of the shared memory to
/// allocate for the output.
/// \param parser The ModelParser object to get the model details.
/// \param factory The ClientBackendFactory object used to create
/// client to the server.
/// \param manager Returns a new ConcurrencyManager object.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);
/// Adjusts the number of concurrent requests to be the same as
/// 'concurrent_request_count' (by creating or pausing threads)
  /// \param concurrent_request_count The number of concurrent requests.
/// \return cb::Error object indicating success or failure.
cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
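  // Example usage (a minimal sketch; the parser, factory, and
  // shared_memory_type values are assumed to be set up elsewhere, and the
  // numeric arguments are hypothetical):
  //
  //   std::unique_ptr<LoadManager> manager;
  //   ConcurrencyManager::Create(
  //       /*async=*/true, /*streaming=*/false, /*batch_size=*/1,
  //       /*max_threads=*/16, /*max_concurrency=*/8, shared_memory_type,
  //       /*output_shm_size=*/0, parser, factory, &manager);
  //   // Request a load of 4 concurrent requests; later calls can raise or
  //   // lower the level by creating or pausing worker threads.
  //   static_cast<ConcurrencyManager*>(manager.get())
  //       ->ChangeConcurrencyLevel(4);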
protected:
// Makes a new worker
virtual std::shared_ptr<IWorker> MakeWorker(
std::shared_ptr<ThreadStat>,
std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
private:
ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
void InitManagerFinalize() override;
// Pause all worker threads that are working on sequences
//
void PauseSequenceWorkers();
// Create new threads (if necessary), and then reconfigure all worker threads
// to handle the new concurrent request count
//
void ReconfigThreads(size_t concurrent_request_count);
// Restart all worker threads that were working on sequences
//
void ResumeSequenceWorkers();
// The number of worker threads with non-zero concurrencies
size_t active_threads_;
bool execute_;
size_t max_concurrency_;
std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
#ifndef DOCTEST_CONFIG_DISABLE
friend TestConcurrencyManager;
public:
ConcurrencyManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "concurrency_worker.h"
#include <algorithm>
#include "client_backend/client_backend.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
// Function for worker threads.
// If the model is a non-sequence model, each worker uses only one context
// to maintain the concurrency assigned to it.
// If the model is a sequence model, each worker has to use multiple contexts
// to maintain the (sequence) concurrency assigned to it.
void
ConcurrencyWorker::Infer()
{
CreateCtxIdTracker();
ReserveContexts();
  // Run inferencing until the exit signal is received, in order to maintain
  // the server load.
do {
HandleExecuteOff();
if (HandleNoConcurrency()) {
return;
}
CreateContextsAsNecessary();
if (HandleExitConditions()) {
return;
}
SendInferRequests();
if (HandleExitConditions()) {
return;
}
WaitForResponses();
if (HandleExitConditions()) {
return;
}
} while (true);
}
void
ConcurrencyWorker::CreateCtxIdTracker()
{
bool is_concurrency = true;
bool serial_sequences = false;
ctx_id_tracker_ = CtxIdTrackerFactory::CreateTracker(
is_concurrency, on_sequence_model_, serial_sequences);
}
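// Note: because is_concurrency is fixed to true here, the factory returns a
// FifoCtxIdTracker for sequence models and a ConcurrencyCtxIdTracker
// otherwise (see CtxIdTrackerFactory::CreateTracker).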
void
ConcurrencyWorker::ReserveContexts()
{
  // Reserve the vectors in the case of sequence models. In non-sequence or
  // synchronous mode only one context will be opened, hence there is no need
  // to reserve.
if (on_sequence_model_ && async_) {
thread_stat_->contexts_stat_.reserve(max_concurrency_);
ctxs_.reserve(max_concurrency_);
}
}
void
ConcurrencyWorker::HandleExecuteOff()
{
if (on_sequence_model_) {
if (!execute_) {
// Ensures the clean exit of the sequences
CompleteOngoingSequences();
WaitForOngoingRequests();
      // Reset Ctx IDs because CompleteOngoingSequences()
      // has destructive side effects
ResetFreeCtxIds();
// Wait if no request should be sent and it is not exiting
thread_config_->is_paused_ = true;
std::unique_lock<std::mutex> lock(wake_mutex_);
wake_signal_.wait(lock, [this]() { return early_exit || execute_; });
// TODO REFACTOR TMA-1043 - memory manager should be handling this instead
// of here
for (auto ctx : ctxs_) {
ctx->SetNumActiveThreads(active_threads_);
}
}
}
thread_config_->is_paused_ = false;
}
bool
ConcurrencyWorker::HandleNoConcurrency()
{
  // Only interact with the synchronization mechanism if the worker should wait
if (thread_config_->concurrency_ == 0) {
// Wait if no request should be sent and it is not exiting
std::unique_lock<std::mutex> lock(wake_mutex_);
wake_signal_.wait(lock, [this]() {
return early_exit || (thread_config_->concurrency_ > 0);
});
// Stop executing if concurrency is 0 and early exit is requested
if (early_exit && thread_config_->concurrency_ == 0) {
return true;
}
}
return false;
}
void
ConcurrencyWorker::CreateContextsAsNecessary()
{
  // If the model is a non-sequence model, use one InferContext to
  // maintain the concurrency for this thread.
size_t active_ctx_cnt = on_sequence_model_ ? thread_config_->concurrency_ : 1;
if (active_ctx_cnt > ctxs_.size()) {
while (active_ctx_cnt > ctxs_.size()) {
CreateContext();
}
ResetFreeCtxIds();
}
// TODO REFACTOR TMA-1043 -- this shouldn't be handled here
for (auto ctx : ctxs_) {
ctx->SetNumActiveThreads(active_threads_);
}
}
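// Illustration: for a sequence model whose thread is configured with
// concurrency_ == 4, the loop above grows ctxs_ from 0 to 4 contexts and then
// resets the free-context-ID pool; for a non-sequence model a single context
// is created regardless of the configured concurrency.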
void
ConcurrencyWorker::SendInferRequests()
{
while (ctx_id_tracker_->IsAvailable() && execute_ && !ShouldExit()) {
uint32_t ctx_id = GetCtxId();
SendInferRequest(ctx_id);
RestoreFreeCtxId(ctx_id);
}
}
void
ConcurrencyWorker::WaitForResponses()
{
if (async_) {
{
// If async, then wait for signal from callback.
std::unique_lock<std::mutex> lk(cb_mtx_);
thread_stat_->idle_timer.Start();
cb_cv_.wait(lk, [this] {
if (notified_) {
notified_ = false;
return true;
}
return false;
});
thread_stat_->idle_timer.Stop();
}
}
}
void
ConcurrencyWorker::ResetFreeCtxIds()
{
std::lock_guard<std::mutex> lock(cb_mtx_);
ctx_id_tracker_->Reset(thread_config_->concurrency_);
}
uint32_t
ConcurrencyWorker::GetSeqStatIndex(uint32_t ctx_id)
{
return (thread_config_->seq_stat_index_offset_ + ctx_id);
}
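// Example: a worker whose ThreadConfig carries seq_stat_index_offset_ == 3
// maps its local ctx_id values 0..2 to global sequence-stat indices 3..5,
// keeping workers from overlapping in the shared sequence statistics.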
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include "load_worker.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockConcurrencyWorker;
#endif
/// Worker thread for the ConcurrencyManager
///
/// The worker maintains concurrency in different ways:
/// For sequence models, multiple contexts must be created for multiple
/// concurrent sequences.
///
/// For non-sequence models, one context can send out multiple requests
/// at the same time. Thus a single context is used, since every infer context
/// implicitly creates a worker thread.
///
class ConcurrencyWorker : public LoadWorker {
public:
struct ThreadConfig {
ThreadConfig(size_t thread_id)
: thread_id_(thread_id), concurrency_(0), seq_stat_index_offset_(0),
is_paused_(false)
{
}
// ID of corresponding worker thread
size_t thread_id_;
// The concurrency level that the worker should produce
size_t concurrency_;
// The starting sequence stat index for this worker
size_t seq_stat_index_offset_;
// Whether or not the thread is issuing new inference requests
bool is_paused_;
};
ConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async,
const size_t max_concurrency, const bool using_json_data,
const bool streaming, const int32_t batch_size,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
size_t& active_threads, bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: LoadWorker(
id, thread_stat, parser, data_loader, factory, on_sequence_model,
async, streaming, batch_size, using_json_data, wake_signal,
wake_mutex, execute, infer_data_manager, sequence_manager),
thread_config_(thread_config), max_concurrency_(max_concurrency),
active_threads_(active_threads)
{
}
void Infer() override;
private:
const size_t max_concurrency_;
// TODO REFACTOR TMA-1020 can we decouple this thread from the total count of
// threads?
size_t& active_threads_;
std::shared_ptr<ThreadConfig> thread_config_;
void CreateCtxIdTracker();
// Reserve vector size for contexts
void ReserveContexts();
// Handle the case where execute_ is false
void HandleExecuteOff();
// Handle the case where this thread is configured to do nothing
// Returns true if an exit condition was met
bool HandleNoConcurrency();
// Create and populate contexts if needed
void CreateContextsAsNecessary();
// Send out the desired concurrency of requests
void SendInferRequests();
void WaitForResponses();
void ResetFreeCtxIds();
uint32_t GetSeqStatIndex(uint32_t ctx_id) override;
void CreateContextFinalize(std::shared_ptr<InferContext> ctx) override
{
ctx->RegisterAsyncCallbackFinalize(std::bind(
&ConcurrencyWorker::AsyncCallbackFinalize, this,
std::placeholders::_1));
}
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockConcurrencyWorker;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cstdint>
#include <string>
#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)
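// The two-level macro is the standard trick for stringizing the *value* of a
// macro rather than its name. For example, if the build passes
// -DGIT_SHA=0a1b2c3 (the value shown here is hypothetical), STRINGIFY(GIT_SHA)
// first expands GIT_SHA and yields "0a1b2c3", whereas the single-level
// STRINGIFY_(GIT_SHA) would yield "GIT_SHA".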
namespace triton { namespace perfanalyzer {
const std::string SHA{STRINGIFY(GIT_SHA)};
const std::string VERSION{STRINGIFY(PERF_ANALYZER_VERSION)};
constexpr static const uint32_t SUCCESS = 0;
constexpr static const uint32_t STABILITY_ERROR = 2;
constexpr static const uint32_t OPTION_ERROR = 3;
constexpr static const uint32_t GENERIC_ERROR = 99;
const double DELAY_PCT_THRESHOLD{1.0};
/// Different measurement modes possible.
enum MeasurementMode { TIME_WINDOWS = 0, COUNT_WINDOWS = 1 };
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include "concurrency_ctx_id_tracker.h"
#include "fifo_ctx_id_tracker.h"
#include "rand_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Factory for creating the Context ID tracker appropriate to the load
// generation mode (concurrency vs. request rate, sequence vs. non-sequence)
//
class CtxIdTrackerFactory {
public:
CtxIdTrackerFactory() = delete;
/// Creates and returns a Context Id Tracker
///
/// \param is_concurrency True if targeting Concurrency
/// \param is_sequence_model True if the model is a sequence model
/// \param serial_sequences True if in serial sequence mode
///
static std::shared_ptr<ICtxIdTracker> CreateTracker(
bool is_concurrency, bool is_sequence_model, bool serial_sequences)
{
if (is_concurrency) {
if (is_sequence_model) {
return std::make_shared<FifoCtxIdTracker>();
} else {
return std::make_shared<ConcurrencyCtxIdTracker>();
}
} else {
if (is_sequence_model && serial_sequences) {
return std::make_shared<FifoCtxIdTracker>();
} else {
return std::make_shared<RandCtxIdTracker>();
}
}
}
};
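// Selection summary, derived from CreateTracker above:
//
//   is_concurrency | is_sequence_model | serial_sequences | tracker returned
//   ---------------+-------------------+------------------+-------------------------
//   true           | true              | (any)            | FifoCtxIdTracker
//   true           | false             | (any)            | ConcurrencyCtxIdTracker
//   false          | true              | true             | FifoCtxIdTracker
//   false          | (all other cases) |                  | RandCtxIdTracker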
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "custom_load_manager.h"
#include <fstream>
#include "constants.h"
namespace triton { namespace perfanalyzer {
cb::Error
CustomLoadManager::Create(
const bool async, const bool streaming,
const uint64_t measurement_window_ms, const size_t max_trials,
const std::string& request_intervals_file, const int32_t batch_size,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager)
{
std::unique_ptr<CustomLoadManager> local_manager(new CustomLoadManager(
async, streaming, request_intervals_file, batch_size,
measurement_window_ms, max_trials, max_threads, num_of_sequences,
shared_memory_type, output_shm_size, serial_sequences, parser, factory));
*manager = std::move(local_manager);
return cb::Error::Success;
}
CustomLoadManager::CustomLoadManager(
const bool async, const bool streaming,
const std::string& request_intervals_file, int32_t batch_size,
const uint64_t measurement_window_ms, const size_t max_trials,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: RequestRateManager(
async, streaming, Distribution::CUSTOM, batch_size,
measurement_window_ms, max_trials, max_threads, num_of_sequences,
shared_memory_type, output_shm_size, serial_sequences, parser,
factory),
request_intervals_file_(request_intervals_file)
{
}
cb::Error
CustomLoadManager::InitCustomIntervals()
{
PauseWorkers();
ConfigureThreads();
auto status = GenerateSchedule();
ResumeWorkers();
return status;
}
cb::Error
CustomLoadManager::GenerateSchedule()
{
if (request_intervals_file_.empty()) {
return cb::Error::Success;
}
RETURN_IF_ERROR(
ReadTimeIntervalsFile(request_intervals_file_, &custom_intervals_));
auto worker_schedules = CreateWorkerSchedules();
GiveSchedulesToWorkers(worker_schedules);
return cb::Error::Success;
}
std::vector<RateSchedulePtr_t>
CustomLoadManager::CreateWorkerSchedules()
{
std::vector<RateSchedulePtr_t> worker_schedules =
CreateEmptyWorkerSchedules();
std::vector<size_t> thread_ids{CalculateThreadIds()};
size_t thread_id_index = 0;
size_t worker_index = 0;
size_t intervals_index = 0;
std::chrono::nanoseconds next_timestamp(0);
bool started = false;
  // Keep filling the schedule until both the thread_ids list (whose size can
  // differ if sequences are enabled) and the intervals list have wrapped back
  // to their starts. This effectively walks the least common multiple of the
  // two sizes and ensures that the schedule is complete and can be repeated
  // indefinitely
  //
while (!started || thread_id_index != 0 || intervals_index != 0) {
started = true;
next_timestamp += custom_intervals_[intervals_index];
worker_index = thread_ids[thread_id_index];
worker_schedules[worker_index]->intervals.emplace_back(next_timestamp);
thread_id_index = (thread_id_index + 1) % thread_ids.size();
intervals_index = (intervals_index + 1) % custom_intervals_.size();
}
SetScheduleDurations(worker_schedules);
return worker_schedules;
}
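// Worked example (hypothetical sizes): with thread_ids = {0, 1} and three
// custom intervals {10ms, 20ms, 30ms}, the loop runs LCM(2, 3) = 6 times and
// produces cumulative timestamps 10, 30, 60, 70, 90 and 120 ms, handed out
// alternately:
//
//   worker 0: 10ms, 60ms, 90ms
//   worker 1: 30ms, 70ms, 120ms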
cb::Error
CustomLoadManager::GetCustomRequestRate(double* request_rate)
{
if (custom_intervals_.empty()) {
return cb::Error("The custom intervals vector is empty", pa::GENERIC_ERROR);
}
uint64_t total_time_ns = 0;
for (auto interval : custom_intervals_) {
total_time_ns += interval.count();
}
*request_rate =
(custom_intervals_.size() * NANOS_PER_SECOND) / (total_time_ns);
return cb::Error::Success;
}
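// Example: four custom intervals of 250ms each give total_time_ns = 1e9, so
// the derived rate is (4 * NANOS_PER_SECOND) / 1e9 = 4 requests per second.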
cb::Error
CustomLoadManager::ReadTimeIntervalsFile(
const std::string& path, NanoIntervals* contents)
{
std::ifstream in(path);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
std::string current_string;
while (std::getline(in, current_string)) {
    std::chrono::nanoseconds current_time_interval_ns(
        std::stol(current_string) * 1000);
    contents->push_back(current_time_interval_ns);
}
in.close();
if (contents->size() == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
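// Note: each line of the intervals file is parsed as an integer and multiplied
// by 1000 before being stored as nanoseconds, i.e. the file is expected to
// contain one interval per line expressed in microseconds. For example, lines
// "1000" and "2000" yield intervals of 1ms and 2ms respectively.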
}} // namespace triton::perfanalyzer