OpenDAS / Lmdeploy / Commits

Commit b30f3cdb, authored Nov 14, 2023 by xiabo
Add downloaded code
parent e38ee081
Changes: 157

Showing 17 changed files with 5026 additions and 0 deletions (+5026 −0):

3rdparty/core-r22.12/src/model_repository_manager.h (+345 −0)
3rdparty/core-r22.12/src/numa_utils.cc (+237 −0)
3rdparty/core-r22.12/src/numa_utils.h (+57 −0)
3rdparty/core-r22.12/src/payload.cc (+215 −0)
3rdparty/core-r22.12/src/payload.h (+102 −0)
3rdparty/core-r22.12/src/pinned_memory_manager.cc (+378 −0)
3rdparty/core-r22.12/src/pinned_memory_manager.h (+108 −0)
3rdparty/core-r22.12/src/rate_limiter.cc (+943 −0)
3rdparty/core-r22.12/src/rate_limiter.h (+310 −0)
3rdparty/core-r22.12/src/repo_agent.cc (+573 −0)
3rdparty/core-r22.12/src/repo_agent.h (+182 −0)
3rdparty/core-r22.12/src/response_allocator.h (+77 −0)
3rdparty/core-r22.12/src/response_cache.cc (+542 −0)
3rdparty/core-r22.12/src/response_cache.h (+198 −0)
3rdparty/core-r22.12/src/scheduler.h (+80 −0)
3rdparty/core-r22.12/src/scheduler_utils.cc (+423 −0)
3rdparty/core-r22.12/src/scheduler_utils.h (+256 −0)
3rdparty/core-r22.12/src/model_repository_manager.h (new file, mode 100644)
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once

#include <functional>
#include <map>
#include <mutex>
#include <set>

#include "infer_parameter.h"
#include "model_config.pb.h"
#include "model_lifecycle.h"
#include "status.h"
#include "triton/common/model_config.h"

namespace triton { namespace core {

class InferenceServer;
class Model;

// [FIXME] should have separated load / unload functions for clarity
enum ActionType { NO_ACTION, LOAD, UNLOAD };

/// Predefined reason strings
#define MODEL_READY_REASON_DUPLICATE "model appears in two or more repositories"

/// An object to manage the model repository active in the server.
class ModelRepositoryManager {
 public:
  // Index information for a model.
  struct ModelIndex {
    ModelIndex(const std::string& n)
        : name_only_(true), name_(n), version_(-1),
          state_(ModelReadyState::UNKNOWN)
    {
    }
    ModelIndex(
        const std::string& n, const int64_t v, const ModelReadyState s,
        const std::string& r)
        : name_only_(false), name_(n), version_(v), state_(s), reason_(r)
    {
    }
    const bool name_only_;
    const std::string name_;
    const int64_t version_;
    const ModelReadyState state_;
    const std::string reason_;
  };

  /// A basic unit in the dependency graph that records the models seen by
  /// the model repository manager.
  struct DependencyNode {
    DependencyNode(const std::string& model_name)
        : model_name_(model_name), status_(Status::Success), checked_(false)
    {
    }
    std::string model_name_;
    Status status_;
    bool checked_;
    bool explicitly_load_;
    inference::ModelConfig model_config_;
    std::set<int64_t> loaded_versions_;
    std::set<DependencyNode*> missing_upstreams_;
    std::unordered_map<DependencyNode*, std::set<int64_t>> upstreams_;
    std::set<DependencyNode*> downstreams_;
  };

  ~ModelRepositoryManager();

  /// Create a manager for a repository.
  /// \param server The pointer to the inference server.
  /// \param server_version The version of the inference server.
  /// \param repository_paths A set of file-system paths of the repositories.
  /// \param startup_models A set of models to be loaded at startup
  /// if model control is enabled.
  /// \param strict_model_config If false attempt to autofill missing required
  /// information in each model configuration.
  /// \param polling_enabled If true, then PollAndUpdate() is allowed.
  /// Otherwise, it is not allowed.
  /// \param model_control_enabled If true, then LoadUnloadModel() is allowed
  /// and the models in the model repository will not be loaded at startup.
  /// Otherwise, LoadUnloadModel() is not allowed and the models will be
  /// loaded. Cannot be set to true if polling_enabled is true.
  /// \param life_cycle_options The options to configure ModelLifeCycle.
  /// \param model_repository_manager Return the model repository manager.
  /// \return The error status.
  static Status Create(
      InferenceServer* server, const std::string& server_version,
      const std::set<std::string>& repository_paths,
      const std::set<std::string>& startup_models,
      const bool strict_model_config, const bool polling_enabled,
      const bool model_control_enabled,
      const ModelLifeCycleOptions& life_cycle_options,
      std::unique_ptr<ModelRepositoryManager>* model_repository_manager);

  /// Poll the model repository to determine the new set of models and
  /// compare with the current set. Serve the new set of models based
  /// on their version policy.
  Status PollAndUpdate();

  /// Load or unload a specified model.
  /// \param models The models and the parameters to be loaded or unloaded.
  /// \param type The type of action to be performed. If the action is LOAD
  /// and the model has been loaded, the model will be re-loaded.
  /// \return error status. Return "NOT_FOUND" if it tries to load
  /// a non-existing model or if it tries to unload a model that hasn't been
  /// loaded.
  Status LoadUnloadModel(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      const ActionType type, const bool unload_dependents);

  /// Unload all models. This function should be called before shutting down
  /// the model repository manager.
  /// \return error status.
  Status UnloadAllModels();

  /// Instruct all models to stop accepting new inference requests. However,
  /// the models are still capable of processing inference requests
  /// if the model considers them as part of the in-flight inference.
  /// \return error status.
  Status StopAllModels();

  /// \return the number of in-flight inferences for all versions of all
  /// models. The set element will be a tuple of <model_name, model_version,
  /// in-flight inference count>. Note that a model version will not be
  /// included if it doesn't have in-flight inferences.
  const std::set<std::tuple<std::string, int64_t, size_t>> InflightStatus();

  /// \param strict_readiness If true, only models that have at least one
  /// ready version will be considered as live. Otherwise, the models that
  /// have loading / unloading versions will also be live.
  /// \return the state of all versions of all live models.
  const ModelStateMap LiveModelStates(bool strict_readiness = false);

  /// \return the state of all versions of all models that have ever
  /// been (attempted) loaded over the lifetime of the server.
  const ModelStateMap ModelStates();

  /// \return the states of all versions of a specific model.
  const VersionStateMap VersionStates(const std::string& model_name);

  /// \return the ready-state of a specific model version.
  Status ModelState(
      const std::string& model_name, const int64_t model_version,
      ModelReadyState* state);

  /// Get the index of all models in all repositories.
  /// \param ready_only If true return only the index of models that are
  /// ready.
  /// \param index Returns the index.
  /// \return error status.
  Status RepositoryIndex(const bool ready_only, std::vector<ModelIndex>* index);

  /// Obtain the specified model.
  /// \param model_name The name of the model.
  /// \param model_version The version of the model.
  /// \param model Return the model object.
  /// \return error status.
  Status GetModel(
      const std::string& model_name, const int64_t model_version,
      std::shared_ptr<Model>* model);

  /// Register model repository path.
  /// \param repository Path to model repository.
  /// \param model_mapping Mapping with (overridden) model name as key, subdir
  /// name as value.
  /// \return error status
  Status RegisterModelRepository(
      const std::string& repository,
      const std::unordered_map<std::string, std::string>& model_mapping);

  /// Unregister model repository path.
  /// \param repository Path to model repository.
  /// \return error status
  Status UnregisterModelRepository(const std::string& repository);

 private:
  struct ModelInfo;

  // Map from model name to information about the model.
  using ModelInfoMap =
      std::unordered_map<std::string, std::unique_ptr<ModelInfo>>;

  // Set of DependencyNode
  using NodeSet = std::set<DependencyNode*>;

  ModelRepositoryManager(
      const std::set<std::string>& repository_paths, const bool autofill,
      const bool polling_enabled, const bool model_control_enabled,
      const double min_compute_capability,
      std::unique_ptr<ModelLifeCycle> life_cycle);

  /// The internal function that is called in Create() and PollAndUpdate().
  Status PollAndUpdateInternal(bool* all_models_polled);

  /// The internal function that loads or unloads a set of models.
  Status LoadUnloadModels(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      const ActionType type, const bool unload_dependents,
      bool* all_models_polled);

  /// Poll the requested models in the model repository and
  /// compare with the current set. Return the additions, deletions,
  /// and modifications that have occurred. This function will not update
  /// the current model info; it is the caller's responsibility to do so.
  /// \param models The map from models to be polled to their associated
  /// parameters.
  /// \param added The names of the models added to the repository.
  /// \param deleted The names of the models removed from the repository.
  /// \param modified The names of the models remaining in the
  /// repository that have been changed.
  /// \param unmodified The names of the models remaining in the
  /// repository that have not changed.
  /// \param updated_infos The model infos retrieved from the poll.
  /// \param all_models_polled Return true if all models are polled and
  /// their model configurations are validated successfully. Instead of
  /// aborting the polling, the models that fail will be ignored and their
  /// model infos will stay in the previous state.
  /// \return The error status.
  Status Poll(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      std::set<std::string>* added, std::set<std::string>* deleted,
      std::set<std::string>* modified, std::set<std::string>* unmodified,
      ModelInfoMap* updated_infos, bool* all_models_polled);

  /// Helper function for Poll() to initialize ModelInfo for the model.
  /// \param name The name of the model.
  /// \param path The model path. Empty path means the model is provided via
  /// 'params'.
  /// \param params The model parameters provided for polling the model.
  /// \param info Return the updated ModelInfo. 'nullptr' will be returned if
  /// the existing ModelInfo for the model should be reused.
  /// \return The error status.
  Status InitializeModelInfo(
      const std::string& name, const std::string& path,
      const std::vector<const InferenceParameter*>& params,
      std::unique_ptr<ModelInfo>* info);

  /// Load models based on the dependency graph. The function will iteratively
  /// load models once all the models they depend on have been loaded, and
  /// unload models if their dependencies are no longer satisfied.
  /// \return The status of the model loads.
  std::map<std::string, Status> LoadModelByDependency();

  /// Helper function to update the dependency graph based on the poll result.
  /// \param added The names of the models added to the repository.
  /// \param deleted The names of the models removed from the repository.
  /// \param modified The names of the models remaining in the
  /// repository that have been changed.
  /// \param deleted_dependents The names of dependent models to be removed
  /// from the repository.
  /// \return The error status.
  Status UpdateDependencyGraph(
      const std::set<std::string>& added, const std::set<std::string>& deleted,
      const std::set<std::string>& modified,
      std::set<std::string>* deleted_dependents = nullptr);

  /// Helper function to uncheck the nodes because the model that they depend
  /// on has changed. The unchecked nodes will be validated again.
  /// The function will be called recursively to uncheck all downstreams.
  /// \param downstreams The nodes to be unchecked.
  /// \param updated_nodes Return the nodes that have been unchecked.
  void UncheckDownstream(NodeSet* downstreams, NodeSet* updated_nodes);

  /// Helper function to construct the edges between nodes in the dependency
  /// graph.
  /// \param updated_node The node that is newly added or modified.
  /// \return True if the node represents an ensemble model. False otherwise.
  bool ConnectDependencyGraph(DependencyNode* updated_node);

  /// Get the model info for a named model.
  /// \param name The model name.
  /// \param model_info Returns the model information.
  /// \return OK if found, NOT_FOUND otherwise.
  Status GetModelInfo(const std::string& name, ModelInfo** model_info);

  /// Get the models to be loaded / unloaded based on the models loaded in
  /// the previous iteration.
  /// \param loaded_models The models loaded / unloaded in the previous
  /// iteration. Unloaded models will be represented as models with no loaded
  /// versions.
  /// \return A pair of node sets containing models to be loaded and models
  /// to be unloaded for the next iteration.
  std::pair<NodeSet, NodeSet> ModelsToLoadUnload(const NodeSet& loaded_models);

  /// Check if the node is ready for the next iteration. A node is ready if
  /// the node is invalid (containing an invalid model config or its
  /// dependencies failed to load) or if all of its dependencies are
  /// satisfied.
  /// \param node The node to be checked.
  /// \return True if the node is ready. False otherwise.
  bool CheckNode(DependencyNode* node);

  Status CircularcyCheck(
      DependencyNode* current_node, const DependencyNode* start_node);

  bool ModelDirectoryOverride(
      const std::vector<const InferenceParameter*>& model_params);

  std::set<std::string> repository_paths_;
  const bool autofill_;
  const bool polling_enabled_;
  const bool model_control_enabled_;
  const double min_compute_capability_;

  std::mutex poll_mu_;
  ModelInfoMap infos_;

  std::unordered_map<std::string, std::unique_ptr<DependencyNode>>
      dependency_graph_;
  std::unordered_map<std::string, std::unique_ptr<DependencyNode>>
      missing_nodes_;

  // Mappings from (overridden) model names to a pair of their repository and
  // absolute path
  std::unordered_map<std::string, std::pair<std::string, std::string>>
      model_mappings_;

  std::unique_ptr<ModelLifeCycle> model_life_cycle_;
};

}}  // namespace triton::core
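Editor's note: the doc comments on LoadModelByDependency() above describe an iterative scheme: on each pass, load every model whose upstream dependencies are already loaded, and stop when a pass makes no progress. The following self-contained sketch illustrates that fixpoint pattern with toy types; the names (ToyNode, LoadAll) are hypothetical stand-ins, not the Triton API.

// Toy illustration of the iterative load-by-dependency pattern documented
// above. Compile with -std=c++17. ToyNode/LoadAll are hypothetical.
#include <iostream>
#include <map>
#include <set>
#include <string>

struct ToyNode {
  std::set<std::string> upstreams;  // models this model depends on
  bool loaded = false;
};

// Repeatedly load every node whose upstreams are all loaded; stop when a
// full pass makes no progress (remaining nodes have unsatisfiable deps).
void LoadAll(std::map<std::string, ToyNode>& graph)
{
  bool progressed = true;
  while (progressed) {
    progressed = false;
    for (auto& [name, node] : graph) {
      if (node.loaded) continue;
      bool ready = true;
      for (const auto& up : node.upstreams) {
        auto it = graph.find(up);
        ready = ready && (it != graph.end()) && it->second.loaded;
      }
      if (ready) {
        node.loaded = true;  // stands in for the real model load
        progressed = true;
        std::cout << "loaded " << name << "\n";
      }
    }
  }
}

int main()
{
  std::map<std::string, ToyNode> graph;
  graph["preprocess"] = {};
  graph["classifier"] = {{"preprocess"}, false};
  graph["ensemble"] = {{"preprocess", "classifier"}, false};
  LoadAll(graph);  // loads preprocess, then classifier, then ensemble
}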
3rdparty/core-r22.12/src/numa_utils.cc (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "numa_utils.h"
#ifndef _WIN32
#include <numa.h>
#include <numaif.h>
#endif
#include "triton/common/logging.h"
namespace
triton
{
namespace
core
{
namespace
{
std
::
string
VectorToString
(
const
std
::
vector
<
int
>&
vec
)
{
std
::
string
str
(
"["
);
for
(
const
auto
&
element
:
vec
)
{
str
+=
std
::
to_string
(
element
);
str
+=
","
;
}
str
+=
"]"
;
return
str
;
}
Status
ParseIntOption
(
const
std
::
string
&
msg
,
const
std
::
string
&
arg
,
int
*
value
)
{
try
{
*
value
=
std
::
stoi
(
arg
);
}
catch
(
const
std
::
invalid_argument
&
ia
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
msg
+
": Can't parse '"
+
arg
+
"' to integer"
);
}
return
Status
::
Success
;
}
}
// namespace
// NUMA setting will be ignored on Windows platform
#ifdef _WIN32
Status
SetNumaConfigOnThread
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
Status
SetNumaMemoryPolicy
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
Status
GetNumaMemoryPolicyNodeMask
(
unsigned
long
*
node_mask
)
{
*
node_mask
=
0
;
return
Status
::
Success
;
}
Status
ResetNumaMemoryPolicy
()
{
return
Status
::
Success
;
}
Status
SetNumaThreadAffinity
(
std
::
thread
::
native_handle_type
thread
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
#else
// Use variable to make sure no NUMA related function is actually called
// if Triton is not running with NUMA awareness. i.e. Extra docker permission
// is needed to call the NUMA functions and this ensures backward compatibility.
thread_local
bool
numa_set
=
false
;
Status
SetNumaConfigOnThread
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
// Set thread affinity
RETURN_IF_ERROR
(
SetNumaThreadAffinity
(
pthread_self
(),
host_policy
));
// Set memory policy
RETURN_IF_ERROR
(
SetNumaMemoryPolicy
(
host_policy
));
return
Status
::
Success
;
}
Status
SetNumaMemoryPolicy
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
const
auto
it
=
host_policy
.
find
(
"numa-node"
);
if
(
it
!=
host_policy
.
end
())
{
int
node_id
;
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'numa-node' value"
,
it
->
second
,
&
node_id
));
LOG_VERBOSE
(
1
)
<<
"Thread is binding to NUMA node "
<<
it
->
second
<<
". Max NUMA node count: "
<<
(
numa_max_node
()
+
1
);
numa_set
=
true
;
unsigned
long
node_mask
=
1UL
<<
node_id
;
if
(
set_mempolicy
(
MPOL_BIND
,
&
node_mask
,
(
numa_max_node
()
+
1
)
+
1
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to set NUMA memory policy: "
)
+
strerror
(
errno
));
}
}
return
Status
::
Success
;
}
Status
GetNumaMemoryPolicyNodeMask
(
unsigned
long
*
node_mask
)
{
*
node_mask
=
0
;
int
mode
;
if
(
numa_set
&&
get_mempolicy
(
&
mode
,
node_mask
,
numa_max_node
()
+
1
,
NULL
,
0
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to get NUMA node for current thread: "
)
+
strerror
(
errno
));
}
return
Status
::
Success
;
}
Status
ResetNumaMemoryPolicy
()
{
if
(
numa_set
&&
(
set_mempolicy
(
MPOL_DEFAULT
,
nullptr
,
0
)
!=
0
))
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to reset NUMA memory policy: "
)
+
strerror
(
errno
));
}
numa_set
=
false
;
return
Status
::
Success
;
}
Status
SetNumaThreadAffinity
(
std
::
thread
::
native_handle_type
thread
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
const
auto
it
=
host_policy
.
find
(
"cpu-cores"
);
if
(
it
!=
host_policy
.
end
())
{
// Parse CPUs
std
::
vector
<
int
>
cpus
;
{
const
auto
&
cpu_str
=
it
->
second
;
auto
delim_cpus
=
cpu_str
.
find
(
","
);
int
current_pos
=
0
;
while
(
true
)
{
auto
delim_range
=
cpu_str
.
find
(
"-"
,
current_pos
);
if
(
delim_range
==
std
::
string
::
npos
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
std
::
string
(
"host policy setting 'cpu-cores' format is "
"'<lower_cpu_core_id>-<upper_cpu_core_id>'. Got "
)
+
cpu_str
.
substr
(
current_pos
,
((
delim_cpus
==
std
::
string
::
npos
)
?
(
cpu_str
.
length
()
+
1
)
:
delim_cpus
)
-
current_pos
));
}
int
lower
,
upper
;
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'cpu-cores' value"
,
cpu_str
.
substr
(
current_pos
,
delim_range
-
current_pos
),
&
lower
));
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'cpu-cores' value"
,
(
delim_cpus
==
std
::
string
::
npos
)
?
cpu_str
.
substr
(
delim_range
+
1
)
:
cpu_str
.
substr
(
delim_range
+
1
,
delim_cpus
-
(
delim_range
+
1
)),
&
upper
));
for
(;
lower
<=
upper
;
++
lower
)
{
cpus
.
push_back
(
lower
);
}
// break if the processed range is the last specified range
if
(
delim_cpus
!=
std
::
string
::
npos
)
{
current_pos
=
delim_cpus
+
1
;
delim_cpus
=
cpu_str
.
find
(
","
,
current_pos
);
}
else
{
break
;
}
}
}
LOG_VERBOSE
(
1
)
<<
"Thread is binding to one of the CPUs: "
<<
VectorToString
(
cpus
);
numa_set
=
true
;
cpu_set_t
cpuset
;
CPU_ZERO
(
&
cpuset
);
for
(
int
cpu
:
cpus
)
{
CPU_SET
(
cpu
,
&
cpuset
);
}
if
(
pthread_setaffinity_np
(
thread
,
sizeof
(
cpu_set_t
),
&
cpuset
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to set NUMA thread affinity: "
)
+
strerror
(
errno
));
}
}
return
Status
::
Success
;
}
#endif
}}
// namespace triton::core
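Editor's note: SetNumaThreadAffinity() above parses the 'cpu-cores' host-policy string as comma-separated '<lower>-<upper>' ranges. Here is a self-contained sketch of just that parsing step, with the in-repo Status/RETURN_IF_ERROR plumbing replaced by plain exceptions so it compiles on its own; the loop structure is slightly simplified but produces the same CPU list.

// Standalone sketch of the 'cpu-cores' range parsing done in
// SetNumaThreadAffinity(). Error handling is simplified to exceptions;
// the real code returns triton::core::Status instead.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int> ParseCpuCores(const std::string& cpu_str)
{
  std::vector<int> cpus;
  size_t current_pos = 0;
  while (true) {
    auto delim_range = cpu_str.find('-', current_pos);
    if (delim_range == std::string::npos) {
      throw std::invalid_argument(
          "'cpu-cores' format is '<lower_cpu_core_id>-<upper_cpu_core_id>'");
    }
    auto delim_cpus = cpu_str.find(',', current_pos);
    int lower =
        std::stoi(cpu_str.substr(current_pos, delim_range - current_pos));
    int upper = std::stoi(
        (delim_cpus == std::string::npos)
            ? cpu_str.substr(delim_range + 1)
            : cpu_str.substr(delim_range + 1, delim_cpus - (delim_range + 1)));
    for (; lower <= upper; ++lower) {
      cpus.push_back(lower);  // expand the inclusive range
    }
    if (delim_cpus == std::string::npos) {
      break;  // the processed range was the last one
    }
    current_pos = delim_cpus + 1;
  }
  return cpus;
}

int main()
{
  for (int cpu : ParseCpuCores("0-2,6-7")) {
    std::cout << cpu << " ";  // prints: 0 1 2 6 7
  }
  std::cout << "\n";
}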
3rdparty/core-r22.12/src/numa_utils.h (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <map>
#include <thread>
#include <vector>

#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"

namespace triton { namespace core {

// Helper function to set memory policy and thread affinity on current thread
Status SetNumaConfigOnThread(
    const triton::common::HostPolicyCmdlineConfig& host_policy);

// Restrict the memory allocation to specific NUMA node.
Status SetNumaMemoryPolicy(
    const triton::common::HostPolicyCmdlineConfig& host_policy);

// Retrieve the node mask used to set memory policy for the current thread
Status GetNumaMemoryPolicyNodeMask(unsigned long* node_mask);

// Reset the memory allocation setting.
Status ResetNumaMemoryPolicy();

// Set a thread affinity to be on specific cpus.
Status SetNumaThreadAffinity(
    std::thread::native_handle_type thread,
    const triton::common::HostPolicyCmdlineConfig& host_policy);

}}  // namespace triton::core
3rdparty/core-r22.12/src/payload.cc (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "payload.h"
namespace
triton
{
namespace
core
{
Payload
::
Payload
()
:
op_type_
(
Operation
::
INFER_RUN
),
requests_
(
std
::
vector
<
std
::
unique_ptr
<
InferenceRequest
>>
()),
OnCallback_
([]()
{}),
instance_
(
nullptr
),
state_
(
State
::
UNINITIALIZED
),
batcher_start_ns_
(
0
),
saturated_
(
false
)
{
exec_mu_
.
reset
(
new
std
::
mutex
());
}
const
Status
&
Payload
::
MergePayload
(
std
::
shared_ptr
<
Payload
>&
payload
)
{
if
((
payload
->
GetOpType
()
!=
Operation
::
INFER_RUN
)
||
(
op_type_
!=
Operation
::
INFER_RUN
))
{
static
Status
op_type_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads of type that are not INFER_RUN"
);
return
op_type_error
;
}
if
(
payload
->
GetInstance
()
!=
instance_
)
{
static
Status
instance_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads of mismatching instance"
);
return
instance_error
;
}
if
((
payload
->
GetState
()
!=
State
::
EXECUTING
)
||
(
state_
!=
State
::
EXECUTING
))
{
static
Status
state_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads that are not in executing state"
);
return
state_error
;
}
// Skip comparison if not initialized (required), here assume either all
// payloads are initialized or otherwise.
if
(
required_equal_inputs_
.
Initialized
()
&&
!
required_equal_inputs_
.
HasEqualInputs
(
*
payload
->
Requests
().
begin
()))
{
static
Status
shape_error
(
Status
::
Code
::
INVALID_ARG
,
"Attempted to merge payloads that has non-equal inputs"
);
return
shape_error
;
}
requests_
.
insert
(
requests_
.
end
(),
std
::
make_move_iterator
(
payload
->
Requests
().
begin
()),
std
::
make_move_iterator
(
payload
->
Requests
().
end
()));
payload
->
Callback
();
return
Status
::
Success
;
}
void
Payload
::
Reset
(
const
Operation
op_type
,
TritonModelInstance
*
instance
)
{
op_type_
=
op_type
;
requests_
.
clear
();
OnCallback_
=
[]()
{};
release_callbacks_
.
clear
();
instance_
=
instance
;
state_
=
State
::
UNINITIALIZED
;
status_
.
reset
(
new
std
::
promise
<
Status
>
());
required_equal_inputs_
=
RequiredEqualInputs
();
batcher_start_ns_
=
0
;
saturated_
=
false
;
}
void
Payload
::
Release
()
{
op_type_
=
Operation
::
INFER_RUN
;
requests_
.
clear
();
OnCallback_
=
[]()
{};
release_callbacks_
.
clear
();
instance_
=
nullptr
;
state_
=
State
::
RELEASED
;
required_equal_inputs_
=
RequiredEqualInputs
();
batcher_start_ns_
=
0
;
saturated_
=
false
;
}
size_t
Payload
::
BatchSize
()
{
size_t
batch_size
=
0
;
for
(
const
auto
&
request
:
requests_
)
{
batch_size
+=
std
::
max
(
1U
,
request
->
BatchSize
());
}
return
batch_size
;
}
void
Payload
::
ReserveRequests
(
size_t
size
)
{
requests_
.
reserve
(
size
);
}
void
Payload
::
AddRequest
(
std
::
unique_ptr
<
InferenceRequest
>
request
)
{
if
((
batcher_start_ns_
==
0
)
||
(
batcher_start_ns_
>
request
->
BatcherStartNs
()))
{
batcher_start_ns_
=
request
->
BatcherStartNs
();
}
requests_
.
push_back
(
std
::
move
(
request
));
}
void
Payload
::
SetCallback
(
std
::
function
<
void
()
>
OnCallback
)
{
OnCallback_
=
OnCallback
;
}
void
Payload
::
SetInstance
(
TritonModelInstance
*
model_instance
)
{
instance_
=
model_instance
;
}
void
Payload
::
AddInternalReleaseCallback
(
std
::
function
<
void
()
>&&
callback
)
{
release_callbacks_
.
emplace_back
(
std
::
move
(
callback
));
}
void
Payload
::
MarkSaturated
()
{
saturated_
=
true
;
}
void
Payload
::
SetState
(
Payload
::
State
state
)
{
state_
=
state
;
}
Status
Payload
::
Wait
()
{
return
status_
->
get_future
().
get
();
}
void
Payload
::
Callback
()
{
OnCallback_
();
}
void
Payload
::
OnRelease
()
{
// Invoke the release callbacks added internally before releasing the
// request to user provided callback.
for
(
auto
it
=
release_callbacks_
.
rbegin
();
it
!=
release_callbacks_
.
rend
();
it
++
)
{
(
*
it
)();
}
release_callbacks_
.
clear
();
}
void
Payload
::
Execute
(
bool
*
should_exit
)
{
*
should_exit
=
false
;
Status
status
;
switch
(
op_type_
)
{
case
Operation
::
INFER_RUN
:
instance_
->
Schedule
(
std
::
move
(
requests_
),
OnCallback_
);
break
;
case
Operation
::
INIT
:
status
=
instance_
->
Initialize
();
break
;
case
Operation
::
WARM_UP
:
status
=
instance_
->
WarmUp
();
break
;
case
Operation
::
EXIT
:
*
should_exit
=
true
;
}
status_
->
set_value
(
status
);
}
}}
// namespace triton::core
3rdparty/core-r22.12/src/payload.h (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <vector>

#include "backend_model_instance.h"
#include "infer_request.h"
#include "scheduler_utils.h"
#include "status.h"

namespace triton { namespace core {

class Payload {
 public:
  enum Operation { INFER_RUN = 0, INIT = 1, WARM_UP = 2, EXIT = 3 };
  enum State {
    UNINITIALIZED = 0,
    READY = 1,
    REQUESTED = 2,
    SCHEDULED = 3,
    EXECUTING = 4,
    RELEASED = 5
  };

  Payload();
  void Reset(const Operation op_type, TritonModelInstance* instance = nullptr);
  const Status& MergePayload(std::shared_ptr<Payload>& payload);

  Operation GetOpType() { return op_type_; }
  std::mutex* GetExecMutex() { return exec_mu_.get(); }
  size_t RequestCount() { return requests_.size(); }
  size_t BatchSize();
  void ReserveRequests(size_t size);
  void AddRequest(std::unique_ptr<InferenceRequest> request);
  std::vector<std::unique_ptr<InferenceRequest>>& Requests()
  {
    return requests_;
  }
  uint64_t BatcherStartNs() { return batcher_start_ns_; }
  void SetCallback(std::function<void()> OnCallback);
  void Callback();
  void AddInternalReleaseCallback(std::function<void()>&& callback);
  void OnRelease();
  void SetInstance(TritonModelInstance* model_instance);
  TritonModelInstance* GetInstance() { return instance_; }
  void MarkSaturated();
  bool IsSaturated() { return saturated_; }
  RequiredEqualInputs* MutableRequiredEqualInputs()
  {
    return &required_equal_inputs_;
  }
  State GetState() { return state_; }
  void SetState(State state);
  void Execute(bool* should_exit);
  Status Wait();
  void Release();

 private:
  Operation op_type_;
  std::vector<std::unique_ptr<InferenceRequest>> requests_;
  std::function<void()> OnCallback_;
  std::vector<std::function<void()>> release_callbacks_;
  TritonModelInstance* instance_;
  State state_;
  std::unique_ptr<std::promise<Status>> status_;
  std::unique_ptr<std::mutex> exec_mu_;
  uint64_t batcher_start_ns_;
  RequiredEqualInputs required_equal_inputs_;
  bool saturated_;
};

}}  // namespace triton::core
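Editor's note: Payload::Wait() blocks on a std::promise<Status> that Payload::Execute() fulfills, which is how callers synchronize with work done on a scheduler thread. A minimal self-contained sketch of that promise/future handshake follows; 'Job' is a toy stand-in (with int in place of Status), not the Payload class itself.

// Minimal sketch of the promise/future handshake used by Payload::Wait()
// and Payload::Execute(): the worker fulfills the promise, the caller
// blocks on the matching future.
#include <future>
#include <iostream>
#include <memory>
#include <thread>

struct Job {
  std::unique_ptr<std::promise<int>> status_ =
      std::make_unique<std::promise<int>>();

  // Caller side: mirrors Payload::Wait().
  int Wait() { return status_->get_future().get(); }

  // Worker side: mirrors Payload::Execute() calling status_->set_value().
  void Execute() { status_->set_value(0 /* stand-in for Status::Success */); }
};

int main()
{
  Job job;
  std::thread worker([&job]() { job.Execute(); });
  std::cout << "job finished with status " << job.Wait() << "\n";
  worker.join();
}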
3rdparty/core-r22.12/src/pinned_memory_manager.cc (new file, mode 100644)
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "pinned_memory_manager.h"
#include <sstream>
#include "numa_utils.h"
#include "triton/common/logging.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace
triton
{
namespace
core
{
namespace
{
std
::
string
PointerToString
(
void
*
ptr
)
{
std
::
stringstream
ss
;
ss
<<
ptr
;
return
ss
.
str
();
}
Status
ParseIntOption
(
const
std
::
string
&
msg
,
const
std
::
string
&
arg
,
int
*
value
)
{
try
{
*
value
=
std
::
stoi
(
arg
);
}
catch
(
const
std
::
invalid_argument
&
ia
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
msg
+
": Can't parse '"
+
arg
+
"' to integer"
);
}
return
Status
::
Success
;
}
}
// namespace
std
::
unique_ptr
<
PinnedMemoryManager
>
PinnedMemoryManager
::
instance_
;
uint64_t
PinnedMemoryManager
::
pinned_memory_byte_size_
;
PinnedMemoryManager
::
PinnedMemory
::
PinnedMemory
(
void
*
pinned_memory_buffer
,
uint64_t
size
)
:
pinned_memory_buffer_
(
pinned_memory_buffer
)
{
if
(
pinned_memory_buffer_
!=
nullptr
)
{
managed_pinned_memory_
=
boost
::
interprocess
::
managed_external_buffer
(
boost
::
interprocess
::
create_only_t
{},
pinned_memory_buffer_
,
size
);
}
}
PinnedMemoryManager
::
PinnedMemory
::~
PinnedMemory
()
{
#ifdef TRITON_ENABLE_GPU
if
(
pinned_memory_buffer_
!=
nullptr
)
{
cudaFreeHost
(
pinned_memory_buffer_
);
}
#endif // TRITON_ENABLE_GPU
}
PinnedMemoryManager
::~
PinnedMemoryManager
()
{
// Clean up
for
(
const
auto
&
memory_info
:
memory_info_
)
{
const
auto
&
is_pinned
=
memory_info
.
second
.
first
;
if
(
!
is_pinned
)
{
free
(
memory_info
.
first
);
}
}
}
void
PinnedMemoryManager
::
AddPinnedMemoryBuffer
(
const
std
::
shared_ptr
<
PinnedMemory
>&
pinned_memory_buffer
,
unsigned
long
node_mask
)
{
pinned_memory_buffers_
[
node_mask
]
=
pinned_memory_buffer
;
}
Status
PinnedMemoryManager
::
AllocInternal
(
void
**
ptr
,
uint64_t
size
,
TRITONSERVER_MemoryType
*
allocated_type
,
bool
allow_nonpinned_fallback
,
PinnedMemory
*
pinned_memory_buffer
)
{
auto
status
=
Status
::
Success
;
if
(
pinned_memory_buffer
->
pinned_memory_buffer_
!=
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
*
ptr
=
pinned_memory_buffer
->
managed_pinned_memory_
.
allocate
(
size
,
std
::
nothrow_t
{});
*
allocated_type
=
TRITONSERVER_MEMORY_CPU_PINNED
;
if
(
*
ptr
==
nullptr
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate pinned system memory"
);
}
}
else
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate pinned system memory: no pinned memory pool"
);
}
bool
is_pinned
=
true
;
if
((
!
status
.
IsOk
())
&&
allow_nonpinned_fallback
)
{
static
bool
warning_logged
=
false
;
if
(
!
warning_logged
)
{
LOG_WARNING
<<
status
.
Message
()
<<
", falling back to non-pinned system memory"
;
warning_logged
=
true
;
}
*
ptr
=
malloc
(
size
);
*
allocated_type
=
TRITONSERVER_MEMORY_CPU
;
is_pinned
=
false
;
if
(
*
ptr
==
nullptr
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate non-pinned system memory"
);
}
else
{
status
=
Status
::
Success
;
}
}
// keep track of allocated buffer or clean up
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
info_mtx_
);
if
(
status
.
IsOk
())
{
auto
res
=
memory_info_
.
emplace
(
*
ptr
,
std
::
make_pair
(
is_pinned
,
pinned_memory_buffer
));
if
(
!
res
.
second
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"unexpected memory address collision, '"
+
PointerToString
(
*
ptr
)
+
"' has been managed"
);
}
LOG_VERBOSE
(
1
)
<<
(
is_pinned
?
""
:
"non-"
)
<<
"pinned memory allocation: "
<<
"size "
<<
size
<<
", addr "
<<
*
ptr
;
}
}
if
((
!
status
.
IsOk
())
&&
(
*
ptr
!=
nullptr
))
{
if
(
is_pinned
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
pinned_memory_buffer
->
managed_pinned_memory_
.
deallocate
(
*
ptr
);
}
else
{
free
(
*
ptr
);
}
}
return
status
;
}
Status
PinnedMemoryManager
::
FreeInternal
(
void
*
ptr
)
{
bool
is_pinned
=
true
;
PinnedMemory
*
pinned_memory_buffer
=
nullptr
;
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
info_mtx_
);
auto
it
=
memory_info_
.
find
(
ptr
);
if
(
it
!=
memory_info_
.
end
())
{
is_pinned
=
it
->
second
.
first
;
pinned_memory_buffer
=
it
->
second
.
second
;
LOG_VERBOSE
(
1
)
<<
(
is_pinned
?
""
:
"non-"
)
<<
"pinned memory deallocation: "
<<
"addr "
<<
ptr
;
memory_info_
.
erase
(
it
);
}
else
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"unexpected memory address '"
+
PointerToString
(
ptr
)
+
"' is not being managed"
);
}
}
if
(
is_pinned
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
pinned_memory_buffer
->
managed_pinned_memory_
.
deallocate
(
ptr
);
}
else
{
free
(
ptr
);
}
return
Status
::
Success
;
}
void
PinnedMemoryManager
::
Reset
()
{
instance_
.
reset
();
}
Status
PinnedMemoryManager
::
Create
(
const
Options
&
options
)
{
if
(
instance_
!=
nullptr
)
{
LOG_WARNING
<<
"New pinned memory pool of size "
<<
options
.
pinned_memory_pool_byte_size_
<<
" could not be created since one already exists"
<<
" of size "
<<
pinned_memory_byte_size_
;
return
Status
::
Success
;
}
instance_
.
reset
(
new
PinnedMemoryManager
());
if
(
options
.
host_policy_map_
.
empty
())
{
void
*
buffer
=
nullptr
;
#ifdef TRITON_ENABLE_GPU
auto
err
=
cudaHostAlloc
(
&
buffer
,
options
.
pinned_memory_pool_byte_size_
,
cudaHostAllocPortable
);
if
(
err
!=
cudaSuccess
)
{
buffer
=
nullptr
;
LOG_WARNING
<<
"Unable to allocate pinned system memory, pinned memory "
"pool will not be available: "
<<
std
::
string
(
cudaGetErrorString
(
err
));
}
else
if
(
options
.
pinned_memory_pool_byte_size_
!=
0
)
{
LOG_INFO
<<
"Pinned memory pool is created at '"
<<
PointerToString
(
buffer
)
<<
"' with size "
<<
options
.
pinned_memory_pool_byte_size_
;
}
else
{
LOG_INFO
<<
"Pinned memory pool disabled"
;
}
#endif // TRITON_ENABLE_GPU
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
buffer
,
options
.
pinned_memory_pool_byte_size_
)),
0
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add Pinned Memory buffer: "
+
std
::
string
(
ex
.
what
()));
}
}
else
{
// Create only one buffer / manager should be created for one node,
// and all associated devices should request memory from the shared manager
std
::
map
<
int32_t
,
std
::
string
>
numa_map
;
for
(
const
auto
host_policy
:
options
.
host_policy_map_
)
{
const
auto
numa_it
=
host_policy
.
second
.
find
(
"numa-node"
);
if
(
numa_it
!=
host_policy
.
second
.
end
())
{
int32_t
numa_id
;
if
(
ParseIntOption
(
"Parsing NUMA node"
,
numa_it
->
second
,
&
numa_id
)
.
IsOk
())
{
numa_map
.
emplace
(
numa_id
,
host_policy
.
first
);
}
}
}
for
(
const
auto
node_policy
:
numa_map
)
{
auto
status
=
SetNumaMemoryPolicy
(
options
.
host_policy_map_
.
at
(
node_policy
.
second
));
if
(
!
status
.
IsOk
())
{
LOG_WARNING
<<
"Unable to allocate pinned system memory for NUMA node "
<<
node_policy
.
first
<<
": "
<<
status
.
AsString
();
continue
;
}
unsigned
long
node_mask
;
status
=
GetNumaMemoryPolicyNodeMask
(
&
node_mask
);
if
(
!
status
.
IsOk
())
{
LOG_WARNING
<<
"Unable to get NUMA node set for current thread: "
<<
status
.
AsString
();
continue
;
}
void
*
buffer
=
nullptr
;
#ifdef TRITON_ENABLE_GPU
auto
err
=
cudaHostAlloc
(
&
buffer
,
options
.
pinned_memory_pool_byte_size_
,
cudaHostAllocPortable
);
if
(
err
!=
cudaSuccess
)
{
buffer
=
nullptr
;
LOG_WARNING
<<
"Unable to allocate pinned system memory, pinned memory "
"pool will not be available: "
<<
std
::
string
(
cudaGetErrorString
(
err
));
}
else
if
(
options
.
pinned_memory_pool_byte_size_
!=
0
)
{
LOG_INFO
<<
"Pinned memory pool is created at '"
<<
PointerToString
(
buffer
)
<<
"' with size "
<<
options
.
pinned_memory_pool_byte_size_
;
}
else
{
LOG_INFO
<<
"Pinned memory pool disabled"
;
}
#endif // TRITON_ENABLE_GPU
ResetNumaMemoryPolicy
();
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
buffer
,
options
.
pinned_memory_pool_byte_size_
)),
node_mask
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add Pinned Memory buffer with host policy: "
+
std
::
string
(
ex
.
what
()));
}
}
// If no pinned memory is allocated, add an empty entry where all allocation
// will be on normal system memory
if
(
instance_
->
pinned_memory_buffers_
.
empty
())
{
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
nullptr
,
options
.
pinned_memory_pool_byte_size_
)),
0
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add empty Pinned Memory entry: "
+
std
::
string
(
ex
.
what
()));
}
}
}
pinned_memory_byte_size_
=
options
.
pinned_memory_pool_byte_size_
;
return
Status
::
Success
;
}
Status
PinnedMemoryManager
::
Alloc
(
void
**
ptr
,
uint64_t
size
,
TRITONSERVER_MemoryType
*
allocated_type
,
bool
allow_nonpinned_fallback
)
{
if
(
instance_
==
nullptr
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
"PinnedMemoryManager has not been created"
);
}
auto
pinned_memory_buffer
=
instance_
->
pinned_memory_buffers_
.
begin
()
->
second
.
get
();
if
(
instance_
->
pinned_memory_buffers_
.
size
()
>
1
)
{
unsigned
long
node_mask
;
if
(
GetNumaMemoryPolicyNodeMask
(
&
node_mask
).
IsOk
())
{
auto
it
=
instance_
->
pinned_memory_buffers_
.
find
(
node_mask
);
if
(
it
!=
instance_
->
pinned_memory_buffers_
.
end
())
{
pinned_memory_buffer
=
it
->
second
.
get
();
}
}
}
return
instance_
->
AllocInternal
(
ptr
,
size
,
allocated_type
,
allow_nonpinned_fallback
,
pinned_memory_buffer
);
}
Status
PinnedMemoryManager
::
Free
(
void
*
ptr
)
{
if
(
instance_
==
nullptr
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
"PinnedMemoryManager has not been created"
);
}
return
instance_
->
FreeInternal
(
ptr
);
}
}}
// namespace triton::core
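Editor's note: AllocInternal()/FreeInternal() above carve sub-allocations out of one big cudaHostAlloc'd region via boost::interprocess::managed_external_buffer, so pinning happens once at pool creation rather than per allocation. The following self-contained sketch shows that sub-allocation technique over a plain malloc'd region (my substitution, so the example runs without CUDA wherever Boost.Interprocess is available).

// Sketch of sub-allocating from one pre-reserved region via
// boost::interprocess::managed_external_buffer, the same technique the
// manager uses over its cudaHostAlloc'd pool. The region here is plain
// malloc'd memory so the example is CUDA-free.
#include <boost/interprocess/managed_external_buffer.hpp>
#include <cstdlib>
#include <iostream>
#include <new>

int main()
{
  const std::size_t pool_size = 1 << 20;  // 1 MiB pool, reserved once
  void* pool = std::malloc(pool_size);

  boost::interprocess::managed_external_buffer managed(
      boost::interprocess::create_only_t{}, pool, pool_size);

  // Nothrow allocate, as in AllocInternal(): nullptr means pool exhausted.
  void* a = managed.allocate(4096, std::nothrow_t{});
  void* b = managed.allocate(8192, std::nothrow_t{});
  std::cout << "a=" << a << " b=" << b << "\n";

  managed.deallocate(a);  // mirrors FreeInternal()
  managed.deallocate(b);

  std::free(pool);
}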
3rdparty/core-r22.12/src/pinned_memory_manager.h (new file, mode 100644)
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once

#include <boost/interprocess/managed_external_buffer.hpp>
#include <map>
#include <memory>
#include <mutex>

#include "status.h"
#include "triton/common/model_config.h"

namespace triton { namespace core {

// This is a singleton class responsible for maintaining the pinned memory
// pool used by the inference server. Pinned memory allocations and
// deallocations must be requested via functions provided by this class.
class PinnedMemoryManager {
 public:
  // Options to configure pinned memory manager.
  struct Options {
    Options(
        uint64_t b = 0,
        const triton::common::HostPolicyCmdlineConfigMap& host_policy_map = {})
        : pinned_memory_pool_byte_size_(b), host_policy_map_(host_policy_map)
    {
    }

    uint64_t pinned_memory_pool_byte_size_;
    triton::common::HostPolicyCmdlineConfigMap host_policy_map_;
  };

  ~PinnedMemoryManager();

  // Create the pinned memory manager based on 'options' specified.
  // Return Status object indicating success or failure.
  static Status Create(const Options& options);

  // Allocate pinned memory with the requested 'size' and return the pointer
  // in 'ptr'. If 'allow_nonpinned_fallback' is true, regular system memory
  // will be allocated as fallback in the case where pinned memory fails to
  // be allocated.
  // Return Status object indicating success or failure.
  static Status Alloc(
      void** ptr, uint64_t size, TRITONSERVER_MemoryType* allocated_type,
      bool allow_nonpinned_fallback);

  // Free the memory allocated by the pinned memory manager.
  // Return Status object indicating success or failure.
  static Status Free(void* ptr);

 protected:
  // Provide explicit control on the lifecycle of the CUDA memory manager,
  // for testing only.
  static void Reset();

 private:
  class PinnedMemory {
   public:
    PinnedMemory(void* pinned_memory_buffer, uint64_t size);
    ~PinnedMemory();
    void* pinned_memory_buffer_;
    std::mutex buffer_mtx_;
    boost::interprocess::managed_external_buffer managed_pinned_memory_;
  };

  PinnedMemoryManager() = default;

  Status AllocInternal(
      void** ptr, uint64_t size, TRITONSERVER_MemoryType* allocated_type,
      bool allow_nonpinned_fallback, PinnedMemory* pinned_memory_buffer);
  Status FreeInternal(void* ptr);
  void AddPinnedMemoryBuffer(
      const std::shared_ptr<PinnedMemory>& pinned_memory_buffer,
      unsigned long node_mask);

  static std::unique_ptr<PinnedMemoryManager> instance_;
  static uint64_t pinned_memory_byte_size_;

  std::mutex info_mtx_;
  std::map<void*, std::pair<bool, PinnedMemory*>> memory_info_;
  std::map<unsigned long, std::shared_ptr<PinnedMemory>>
      pinned_memory_buffers_;
};

}}  // namespace triton::core
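Editor's note: callers never touch the pool directly; they go through the static Create/Alloc/Free API above. A hedged usage sketch follows. It compiles only inside the Triton source tree (Status, TRITONSERVER_MemoryType, and the RETURN_IF_ERROR macro are in-repo definitions I assume are visible here), and UsePinnedPool is a hypothetical caller, not part of the API.

// Hedged usage sketch of the PinnedMemoryManager API declared above.
// Assumes in-tree headers provide Status, TRITONSERVER_MemoryType, and
// RETURN_IF_ERROR; illustrative only, not a standalone program.
#include "pinned_memory_manager.h"

triton::core::Status
UsePinnedPool()
{
  using triton::core::PinnedMemoryManager;

  // One-time setup: reserve a 64 MiB pinned pool.
  RETURN_IF_ERROR(
      PinnedMemoryManager::Create(PinnedMemoryManager::Options(64 << 20)));

  // Per-request: ask for 1 MiB, allowing fallback to regular system memory
  // if the pinned pool is exhausted. 'allocated_type' reports which kind of
  // memory was actually handed back.
  void* ptr = nullptr;
  TRITONSERVER_MemoryType allocated_type;
  RETURN_IF_ERROR(PinnedMemoryManager::Alloc(
      &ptr, 1 << 20, &allocated_type, true /* allow_nonpinned_fallback */));

  // ... use 'ptr' as a staging buffer for host<->device transfers ...

  return PinnedMemoryManager::Free(ptr);
}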
3rdparty/core-r22.12/src/rate_limiter.cc (new file, mode 100644)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rate_limiter.h"
#include <limits>
#include "triton/common/logging.h"
namespace
triton
{
namespace
core
{
constexpr
size_t
MAX_PAYLOAD_BUCKET_COUNT
=
1000
;
//=========================================================================
// Core Implementation
//=========================================================================
Status
RateLimiter
::
Create
(
const
bool
ignore_resources_and_priority
,
const
RateLimiter
::
ResourceMap
&
resource_map
,
std
::
unique_ptr
<
RateLimiter
>*
rate_limiter
)
{
std
::
unique_ptr
<
RateLimiter
>
local_rate_limiter
(
new
RateLimiter
(
ignore_resources_and_priority
,
resource_map
));
*
rate_limiter
=
std
::
move
(
local_rate_limiter
);
return
Status
::
Success
;
}
Status
RateLimiter
::
RegisterModelInstance
(
TritonModelInstance
*
triton_model_instance
,
const
RateLimiterConfig
&
rate_limiter_config
)
{
{
std
::
lock_guard
<
std
::
mutex
>
lk1
(
model_ctx_mtx_
);
std
::
lock_guard
<
std
::
mutex
>
lk2
(
model_instance_ctx_mtx_
);
auto
&
model_context
=
model_contexts_
[
triton_model_instance
->
Model
()];
auto
&
model_instances
=
model_instance_ctxs_
[
triton_model_instance
->
Model
()];
model_instances
.
push_back
(
std
::
shared_ptr
<
ModelInstanceContext
>
(
new
ModelInstanceContext
(
triton_model_instance
,
&
model_context
,
rate_limiter_config
,
[
this
](
ModelInstanceContext
*
instance
)
{
OnStage
(
instance
);
},
[
this
](
ModelInstanceContext
*
instance
)
{
OnRelease
(
instance
);
})));
model_context
.
AddAvailableInstance
(
model_instances
.
back
().
get
());
model_context
.
AddSpecificRequestQueue
();
if
(
!
ignore_resources_and_priority_
)
{
resource_manager_
->
AddModelInstance
(
model_instances
.
back
().
get
());
RETURN_IF_ERROR
(
resource_manager_
->
UpdateResourceLimits
());
}
}
InitializePayloadQueues
(
triton_model_instance
);
return
Status
::
Success
;
}
Status
RateLimiter
::
UnregisterModel
(
const
TritonModel
*
model
)
{
{
std
::
lock_guard
<
std
::
mutex
>
lk1
(
model_ctx_mtx_
);
std
::
lock_guard
<
std
::
mutex
>
lk2
(
model_instance_ctx_mtx_
);
auto
&
model_context
=
model_contexts_
[
model
];
model_context
.
RequestRemoval
();
for
(
const
auto
&
instance
:
model_instance_ctxs_
[
model
])
{
instance
->
WaitForRemoval
();
if
(
!
ignore_resources_and_priority_
)
{
resource_manager_
->
RemoveModelInstance
(
instance
.
get
());
}
}
model_instance_ctxs_
.
erase
(
model
);
model_contexts_
.
erase
(
model
);
}
if
(
!
ignore_resources_and_priority_
)
{
RETURN_IF_ERROR
(
resource_manager_
->
UpdateResourceLimits
());
}
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
payload_queues_mu_
);
if
(
payload_queues_
.
find
(
model
)
!=
payload_queues_
.
end
())
{
payload_queues_
.
erase
(
model
);
}
}
return
Status
::
Success
;
}
bool
RateLimiter
::
PayloadSlotAvailable
(
const
TritonModel
*
model
)
{
bool
result
;
PayloadQueue
*
payload_queue
=
payload_queues_
[
model
].
get
();
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
payload_queue
->
mu_
);
result
=
payload_queue
->
queue_
->
Size
()
<
2
*
payload_queue
->
specific_queues_
.
size
();
}
return
result
;
}
Status
RateLimiter::EnqueuePayload(
    const TritonModel* model, std::shared_ptr<Payload> payload)
{
  auto pinstance = payload->GetInstance();
  if (payload_queues_.find(model) == payload_queues_.end()) {
    LOG_INFO << "Should not print this ";
  }
  PayloadQueue* payload_queue = payload_queues_[model].get();
  {
    std::lock_guard<std::mutex> lk(payload_queue->mu_);
    payload->SetState(Payload::State::REQUESTED);
    if (ignore_resources_and_priority_) {
      SchedulePayload(pinstance, payload_queue, payload);
    }
  }
  if (ignore_resources_and_priority_) {
    if (pinstance == nullptr) {
      payload_queue->cv_.notify_one();
    } else {
      payload_queue->cv_.notify_all();
    }
  } else {
    StandardScheduleFunc sched_func = [this, payload_queue,
                                       payload](ModelInstanceContext* mi) {
      {
        std::lock_guard<std::mutex> lk(payload_queue->mu_);
        this->SchedulePayload(mi->RawInstance(), payload_queue, payload);
      }
      auto cb = [mi]() { mi->Release(); };
      payload->AddInternalReleaseCallback(cb);
      if (mi->RawInstance() == nullptr) {
        payload_queue->cv_.notify_one();
      } else {
        payload_queue->cv_.notify_all();
      }
    };
    DeferPayloadSchedule(sched_func, model, payload->GetInstance());
  }
  return Status::Success;
}
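// A typical producer-side pairing of GetPayload()/EnqueuePayload(), shown
// only as a hypothetical sketch (the real callers live in the scheduler
// implementations, and the exact Payload::Operation enumerator used here is
// an assumption):
//
//   std::shared_ptr<Payload> payload =
//       rate_limiter->GetPayload(Payload::Operation::INFER_RUN);
//   // ... attach requests to the payload ...
//   RETURN_IF_ERROR(rate_limiter->EnqueuePayload(model, payload));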
void
RateLimiter::DequeuePayload(
    std::deque<TritonModelInstance*>& instances,
    std::shared_ptr<Payload>* payload)
{
  payload->reset();
  if (payload_queues_.find(instances[0]->Model()) == payload_queues_.end()) {
    LOG_INFO << "Should not print this ";
  }
  PayloadQueue* payload_queue = payload_queues_[instances[0]->Model()].get();
  std::vector<std::shared_ptr<Payload>> merged_payloads;
  size_t instance_index = std::numeric_limits<std::size_t>::max();
  {
    std::unique_lock<std::mutex> lk(payload_queue->mu_);
    payload_queue->cv_.wait(
        lk, [&instances, &instance_index, payload_queue]() {
          bool empty = payload_queue->queue_->Empty();
          if (empty) {
            instance_index = 0;
            for (const auto instance : instances) {
              empty = payload_queue->specific_queues_[instance]->Empty();
              if (empty) {
                instance_index++;
              } else {
                break;
              }
            }
          }
          return !empty;
        });
    if (instance_index < instances.size()) {
      TritonModelInstance* instance = instances[instance_index];
      if (!payload_queue->specific_queues_[instance]->Empty()) {
        payload_queue->specific_queues_[instance]->Dequeue(
            payload, &merged_payloads);
      }
    } else {
      payload_queue->queue_->Dequeue(payload, &merged_payloads);
    }
  }
  for (auto& merge_payload : merged_payloads) {
    PayloadRelease(merge_payload);
  }
  (*payload)->Callback();
  if ((*payload)->GetInstance() == nullptr) {
    (*payload)->SetInstance(instances.front());
    instances.pop_front();
  } else {
    instances.erase(instances.begin() + instance_index);
  }
}
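// Note: DequeuePayload() blocks on the queue's condition variable until
// either the generic queue or one of the caller's instance-specific queues
// is non-empty. instance_index doubles as a sentinel: it stays at
// numeric_limits<size_t>::max() when the generic queue satisfied the wait,
// and otherwise records which specific queue the wait predicate selected.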
std::shared_ptr<Payload>
RateLimiter::GetPayload(
    const Payload::Operation op_type, TritonModelInstance* instance)
{
  std::shared_ptr<Payload> payload;
  if (max_payload_bucket_count_ > 0) {
    std::lock_guard<std::mutex> lock(payload_mu_);
    if (!payload_bucket_.empty()) {
      payload = payload_bucket_.back();
      payload_bucket_.pop_back();
    }
    if (payload.get() == nullptr && (!payloads_in_use_.empty())) {
      // Just check the front of the queue instead of the entire queue for
      // an available payload, to save time.
      if (payloads_in_use_.front().use_count() == 1) {
        payload = payloads_in_use_.front();
        payloads_in_use_.pop_front();
      }
    }
  }
  if (payload.get() == nullptr) {
    payload.reset(new Payload());
  }
  payload->Reset(op_type, instance);
  return payload;
}
void
RateLimiter::PayloadRelease(std::shared_ptr<Payload>& payload)
{
  payload->OnRelease();
  if (max_payload_bucket_count_ > 0) {
    std::lock_guard<std::mutex> lock(payload_mu_);
    if (payloads_in_use_.size() + payload_bucket_.size() <
        max_payload_bucket_count_) {
      // Release iff the payload shared_ptr is uniquely held.
      if (payload.use_count() == 1) {
        payload->Release();
        payload_bucket_.push_back(std::move(payload));
        return;
      } else {
        payloads_in_use_.push_back(std::move(payload));
      }
    }
  }
}
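// Note on recycling: payload_bucket_ holds payloads that are free for
// immediate reuse, while payloads_in_use_ holds payloads whose shared_ptr is
// still referenced elsewhere; GetPayload() opportunistically reclaims from
// the front of payloads_in_use_ once a payload's use_count drops to 1.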
RateLimiter::RateLimiter(
    const bool ignore_resources_and_priority, const ResourceMap& resource_map)
    : ignore_resources_and_priority_(ignore_resources_and_priority),
      max_payload_bucket_count_(MAX_PAYLOAD_BUCKET_COUNT)
{
  ResourceManager::Create(resource_map, &resource_manager_);
}
void
RateLimiter::InitializePayloadQueues(const TritonModelInstance* instance)
{
  auto& config = instance->Model()->Config();
  uint64_t max_queue_delay_microseconds;
  if (config.has_sequence_batching()) {
    const auto& batcher_config = config.sequence_batching();
    if (batcher_config.has_oldest()) {
      max_queue_delay_microseconds =
          batcher_config.oldest().max_queue_delay_microseconds();
    } else {
      max_queue_delay_microseconds = 0;
    }
  } else if (config.has_dynamic_batching()) {
    max_queue_delay_microseconds =
        config.dynamic_batching().max_queue_delay_microseconds();
  } else {
    max_queue_delay_microseconds = 0;
  }
  {
    std::lock_guard<std::mutex> lk(payload_queues_mu_);
    if (payload_queues_.find(instance->Model()) == payload_queues_.end()) {
      payload_queues_.emplace(
          instance->Model(),
          new PayloadQueue(
              config.max_batch_size(), max_queue_delay_microseconds * 1000));
    }
  }
  PayloadQueue* payload_queue = payload_queues_[instance->Model()].get();
  if (payload_queue->specific_queues_.find(instance) ==
      payload_queue->specific_queues_.end()) {
    payload_queue->specific_queues_.emplace(
        instance,
        new InstanceQueue(
            config.max_batch_size(), max_queue_delay_microseconds * 1000));
  }
}
Status
RateLimiter::DeferPayloadSchedule(
    const StandardScheduleFunc& OnSchedule, const TritonModel* model,
    TritonModelInstance* triton_model_instance)
{
  std::lock_guard<std::mutex> lk(model_ctx_mtx_);
  auto itr = model_contexts_.find(model);
  if (itr == model_contexts_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "Requested model is not yet registered with rate limiter");
  }
  if (itr->second.isRemovalInProgress()) {
    return Status(
        Status::Code::INTERNAL,
        "New model requests can not be made to a model that is being "
        "removed");
  }
  itr->second.EnqueueModelInstanceRequest(OnSchedule, triton_model_instance);
  itr->second.StageInstanceIfAvailable(triton_model_instance);
  return Status::Success;
}
void
RateLimiter::SchedulePayload(
    TritonModelInstance* tmi, PayloadQueue* payload_queue,
    const std::shared_ptr<Payload>& payload)
{
  if (tmi == nullptr) {
    payload_queue->queue_->Enqueue(payload);
  } else {
    payload_queue->specific_queues_[tmi]->Enqueue(payload);
  }
  payload->SetState(Payload::State::SCHEDULED);
}
void
RateLimiter::OnStage(ModelInstanceContext* instance)
{
  {
    std::lock_guard<std::recursive_mutex> lk(staged_instances_mtx_);
    staged_instances_.push(instance);
  }
  AttemptAllocation();
}
void
RateLimiter::OnRelease(ModelInstanceContext* instance)
{
  auto& model_context = model_contexts_[instance->RawInstance()->Model()];
  model_context.AddAvailableInstance(instance);
  resource_manager_->ReleaseResources(instance);
  if (model_context.ContainsPendingRequests(
          instance->RawInstance()->Index())) {
    model_context.StageInstanceIfAvailable(instance->RawInstance());
  }
  AttemptAllocation();
}
void
RateLimiter::AttemptAllocation()
{
  std::lock_guard<std::recursive_mutex> lk(staged_instances_mtx_);
  if (!staged_instances_.empty()) {
    ModelInstanceContext* instance = staged_instances_.top();
    if (resource_manager_->AllocateResources(instance)) {
      staged_instances_.pop();
      instance->Allocate();
    }
  }
}
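// Note: AttemptAllocation() only examines the highest-priority staged
// instance. If that instance's resource demand cannot be satisfied,
// lower-priority staged instances are not considered until resources are
// released and AttemptAllocation() runs again (see OnRelease()).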
//=========================================================================
// ModelContext Implementation
//=========================================================================
RateLimiter::ModelContext::ModelContext() : removal_in_progress_(false) {}
Status
RateLimiter::ModelContext::EnqueueModelInstanceRequest(
    const StandardScheduleFunc& OnSchedule,
    TritonModelInstance* triton_model_instance)
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  if (triton_model_instance == nullptr) {
    generic_sched_request_queue_.push(OnSchedule);
  } else if (
      (uint32_t)triton_model_instance->Index() <
      specific_sched_request_queues_.size()) {
    specific_sched_request_queues_[triton_model_instance->Index()].push(
        OnSchedule);
  } else {
    return Status(
        Status::Code::INTERNAL,
        "expected instance index between 0 and " +
            std::to_string(specific_sched_request_queues_.size()) + ", got " +
            std::to_string(triton_model_instance->Index()));
  }
  return Status::Success;
}
void
RateLimiter::ModelContext::AddAvailableInstance(ModelInstanceContext* instance)
{
  std::lock_guard<std::recursive_mutex> lk(avbl_instances_mtx_);
  avbl_instances_.push(instance);
  instance->MarkAvailable();
}
void
RateLimiter::ModelContext::StageInstanceIfAvailable(
    TritonModelInstance* req_instance)
{
  std::lock_guard<std::recursive_mutex> lk1(sched_request_queue_mtx_);
  std::lock_guard<std::recursive_mutex> lk2(avbl_instances_mtx_);
  PriorityQueue backup_queue;
  while (!avbl_instances_.empty()) {
    ModelInstanceContext* instance = avbl_instances_.top();
    if ((req_instance != nullptr) &&
        (instance->RawInstance() != req_instance)) {
      backup_queue.push(instance);
      avbl_instances_.pop();
      continue;
    }
    if (!specific_sched_request_queues_[instance->RawInstance()->Index()]
             .empty()) {
      // Requests that target this specific model instance get the highest
      // priority.
      const StandardScheduleFunc func =
          specific_sched_request_queues_[instance->RawInstance()->Index()]
              .front();
      specific_sched_request_queues_[instance->RawInstance()->Index()].pop();
      instance->Stage(func);
    } else if (!generic_sched_request_queue_.empty()) {
      // A generic request can run on any model instance, so use the
      // available instance with the highest priority.
      const StandardScheduleFunc func = generic_sched_request_queue_.front();
      generic_sched_request_queue_.pop();
      instance->Stage(func);
    } else {
      // Only requests for other specific model instances remain, so back up
      // this instance and keep searching through the available instances.
      // Prioritization is handled by the staging priority queue.
      backup_queue.push(instance);
    }
    avbl_instances_.pop();
  }
  // Restore the backup queue
  if (!backup_queue.empty()) {
    avbl_instances_.swap(backup_queue);
  }
}
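// Note: both StageInstanceIfAvailable() and AllocateInstanceIfAvailable()
// drain avbl_instances_ into a backup queue and swap it back at the end, so
// instances that received no work keep their relative priority ordering.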
void
RateLimiter::ModelContext::AllocateInstanceIfAvailable()
{
  std::lock_guard<std::recursive_mutex> lk1(sched_request_queue_mtx_);
  std::lock_guard<std::recursive_mutex> lk2(avbl_instances_mtx_);
  PriorityQueue backup_queue;
  while (!avbl_instances_.empty()) {
    ModelInstanceContext* instance = avbl_instances_.top();
    if (!specific_sched_request_queues_[instance->RawInstance()->Index()]
             .empty()) {
      // Requests that target this specific model instance get the highest
      // priority.
      const StandardScheduleFunc func =
          specific_sched_request_queues_[instance->RawInstance()->Index()]
              .front();
      specific_sched_request_queues_[instance->RawInstance()->Index()].pop();
      instance->DirectAllocate(func);
    } else if (!generic_sched_request_queue_.empty()) {
      // A generic request can run on any model instance, so use the
      // available instance with the highest priority.
      const StandardScheduleFunc func = generic_sched_request_queue_.front();
      generic_sched_request_queue_.pop();
      instance->DirectAllocate(func);
    } else {
      // Only requests for other specific model instances remain, so back up
      // this instance and keep searching through the available instances.
      // Prioritization is handled by the staging priority queue.
      backup_queue.push(instance);
    }
    avbl_instances_.pop();
  }
  // Restore the backup queue
  if (!backup_queue.empty()) {
    avbl_instances_.swap(backup_queue);
  }
}
void
RateLimiter::ModelContext::AddSpecificRequestQueue()
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  specific_sched_request_queues_.emplace_back();
}
bool
RateLimiter::ModelContext::ContainsPendingRequests(int index)
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  return (generic_sched_request_queue_.size() != 0) ||
         (specific_sched_request_queues_[index].size() != 0);
}
void
RateLimiter::ModelContext::RequestRemoval()
{
  removal_in_progress_ = true;
}
//=========================================================================
// ModelInstanceContext Implementation
//=========================================================================
RateLimiter::ModelInstanceContext::ModelInstanceContext(
    TritonModelInstance* triton_model_instance,
    RateLimiter::ModelContext* model_context,
    const RateLimiter::RateLimiterConfig& rate_limiter_config,
    RateLimiter::StandardStageFunc OnStage,
    RateLimiter::StandardReleaseFunc OnRelease)
    : triton_model_instance_(triton_model_instance),
      index_(triton_model_instance->Index()), model_context_(model_context),
      rate_limiter_config_(rate_limiter_config), OnStage_(OnStage),
      OnRelease_(OnRelease), exec_count_(0), state_(AVAILABLE)
{
}
void
RateLimiter::ModelInstanceContext::MarkAvailable()
{
  std::lock_guard<std::mutex> lk(state_mtx_);
  state_ = AVAILABLE;
}
Status
RateLimiter::ModelInstanceContext::Stage(StandardScheduleFunc OnSchedule)
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != AVAILABLE) {
      return Status(
          Status::Code::INTERNAL,
          "Can not stage a model instance that is not yet available");
    }
    state_ = STAGED;
    OnSchedule_ = OnSchedule;
  }
  OnStage_(this);
  return Status::Success;
}
Status
RateLimiter::ModelInstanceContext::Allocate()
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != STAGED) {
      return Status(
          Status::Code::INTERNAL,
          "Can not allocate a model instance that is not yet staged");
    }
    state_ = ALLOCATED;
  }
  OnSchedule_(this);
  return Status::Success;
}
Status
RateLimiter::ModelInstanceContext::DirectAllocate(
    StandardScheduleFunc OnSchedule)
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != AVAILABLE) {
      return Status(
          Status::Code::INTERNAL,
          "Can not allocate a model instance that is not yet available");
    }
    state_ = ALLOCATED;
  }
  OnSchedule(this);
  return Status::Success;
}
void
RateLimiter::ModelInstanceContext::Release()
{
  exec_count_++;
  OnRelease_(this);
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if ((model_context_->isRemovalInProgress()) && (state_ == AVAILABLE) &&
        (!model_context_->ContainsPendingRequests(index_))) {
      state_ = REMOVED;
    }
  }
  if (state_ == REMOVED) {
    cv_.notify_all();
  }
}
void
RateLimiter::ModelInstanceContext::RequestRemoval()
{
  std::lock_guard<std::mutex> lk(state_mtx_);
  if ((state_ == AVAILABLE) &&
      (!model_context_->ContainsPendingRequests(index_))) {
    state_ = REMOVED;
  }
}
void
RateLimiter::ModelInstanceContext::WaitForRemoval()
{
  if (!model_context_->isRemovalInProgress()) {
    model_context_->RequestRemoval();
  }
  RequestRemoval();
  // Wait for the instance to be removed
  {
    std::unique_lock<std::mutex> lk(state_mtx_);
    cv_.wait(lk, [this] { return state_ == REMOVED; });
  }
}
double
RateLimiter::ModelInstanceContext::ScaledPriority()
{
  // TODO: Different schemes for the prioritization of model instances can
  // be added here.
  // The priority of an instance is 1 by default. If specified as 0, the
  // priority is still treated as 1.
  auto priority = std::max(rate_limiter_config_.priority(), 1u);
  return (exec_count_ * priority);
}
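// Note: ScaledPriorityComparator (see rate_limiter.h) orders the priority
// queue so that the instance with the *smallest* ScaledPriority() is on top.
// Since ScaledPriority() returns exec_count_ * priority, instances with a
// lower configured priority value and fewer completed executions are
// scheduled first, which yields weighted round-robin behavior across
// instances.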
//=========================================================================
// ResourceManager Implementation
//=========================================================================
Status
RateLimiter::ResourceManager::Create(
    const ResourceMap& resource_map,
    std::unique_ptr<ResourceManager>* resource_manager)
{
  std::unique_ptr<ResourceManager> local_resource_manager(
      new ResourceManager(resource_map));
  *resource_manager = std::move(local_resource_manager);
  return Status::Success;
}
void
RateLimiter::ResourceManager::AddModelInstance(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk(model_resources_mtx_);
  auto pr = model_resources_.emplace(std::make_pair(instance, ResourceMap()));
  for (const auto& resource :
       instance->GetRateLimiterConfig()->resources()) {
    if (resource.global()) {
      (pr.first->second[GLOBAL_RESOURCE_KEY])[resource.name()] =
          resource.count();
    } else {
      (pr.first->second[instance->RawInstance()->DeviceId()])[resource.name()] =
          resource.count();
    }
  }
}
Status
RateLimiter::ResourceManager::RemoveModelInstance(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk(model_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return Status(
        Status::Code::INTERNAL, "Can not find the instance to remove");
  }
  model_resources_.erase(instance);
  return Status::Success;
}
Status
RateLimiter::ResourceManager::UpdateResourceLimits()
{
  std::lock_guard<std::mutex> lk1(max_resources_mtx_);
  std::lock_guard<std::mutex> lk2(model_resources_mtx_);
  max_resources_.clear();
  // Obtain the maximum resource across all the instances
  // and use it as the default available.
  for (const auto& instance_resources : model_resources_) {
    for (const auto& resource_device_map : instance_resources.second) {
      auto ditr = max_resources_.find(resource_device_map.first);
      if (ditr == max_resources_.end()) {
        ditr = max_resources_
                   .emplace(
                       resource_device_map.first, resource_device_map.second)
                   .first;
      } else {
        for (const auto resource : resource_device_map.second) {
          auto ritr = ditr->second.find(resource.first);
          if (ritr == ditr->second.end()) {
            ritr = ditr->second.emplace(resource.first, resource.second).first;
          } else {
            if (ritr->second < resource.second) {
              ritr->second = resource.second;
            }
          }
        }
      }
    }
  }
  if (!explicit_max_resources_.empty()) {
    RETURN_IF_ERROR(ParseAndValidateExplicitResources());
  }
  RETURN_IF_ERROR(ValidateMaxResources());
  if (LOG_VERBOSE_IS_ON(1)) {
    std::string resource_map_str{"\nMax Resource Map===>\n"};
    for (const auto& ditr : max_resources_) {
      if (!ditr.second.empty()) {
        std::string device_str{
            (ditr.first == GLOBAL_RESOURCE_KEY) ? "GLOBAL"
                                                : std::to_string(ditr.first)};
        resource_map_str += "\tDevice: " + device_str + "\n";
        for (const auto& ritr : ditr.second) {
          resource_map_str += "\t\tResource: " + ritr.first +
                              "\tCount: " + std::to_string(ritr.second) + "\n";
        }
      }
    }
    LOG_VERBOSE(1) << resource_map_str;
  }
  return Status::Success;
}
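// Note: when no resource counts are provided explicitly,
// UpdateResourceLimits() takes the maximum per-resource demand across all
// registered instances as the available pool size, so every instance
// remains individually schedulable by construction.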
Status
RateLimiter::ResourceManager::ValidateMaxResources()
{
  for (const auto& global_resource : max_resources_[GLOBAL_RESOURCE_KEY]) {
    for (const auto& ditr : max_resources_) {
      if (ditr.first != GLOBAL_RESOURCE_KEY) {
        for (const auto& ritr : ditr.second) {
          if (global_resource.first.compare(ritr.first) == 0) {
            return Status(
                Status::Code::INVALID_ARG,
                (std::string("Resource \"") + ritr.first +
                 "\" is present as both global and device-specific resource "
                 "in the model configuration.")
                    .c_str());
          }
        }
      }
    }
  }
  return Status::Success;
}
Status
RateLimiter::ResourceManager::ParseAndValidateExplicitResources()
{
  for (auto& ditr : max_resources_) {
    for (auto& ritr : ditr.second) {
      // If not specified explicitly, consider the resource to be unavailable.
      size_t resource_count = 0;
      if (ditr.first == GLOBAL_RESOURCE_KEY) {
        // Ignore the device specification... will search for all resources in
        // the map...
        for (const auto& exp_ditr : explicit_max_resources_) {
          for (const auto& exp_ritr : exp_ditr.second) {
            if (ritr.first.compare(exp_ritr.first) == 0) {
              if (resource_count < exp_ritr.second) {
                resource_count = exp_ritr.second;
              }
            }
          }
        }
      } else {
        // Search only for the device specific or per-device resources...
        // device-specific
        for (const auto& exp_ritr : explicit_max_resources_[ditr.first]) {
          if (ritr.first.compare(exp_ritr.first) == 0) {
            if (resource_count < exp_ritr.second) {
              resource_count = exp_ritr.second;
            }
          }
        }
        // per-device
        for (const auto& exp_ritr :
             explicit_max_resources_[PER_DEVICE_RESOURCE_KEY]) {
          if (ritr.first.compare(exp_ritr.first) == 0) {
            if (resource_count < exp_ritr.second) {
              resource_count = exp_ritr.second;
            }
          }
        }
      }
      if (resource_count < ritr.second) {
        return Status(
            Status::Code::INVALID_ARG,
            (std::string("Resource count for \"") + ritr.first +
             "\" is limited to " + std::to_string(resource_count) +
             " which will prevent scheduling of one or more model "
             "instances, the minimum required count is " +
             std::to_string(ritr.second))
                .c_str());
      } else {
        ritr.second = resource_count;
      }
    }
  }
  return Status::Success;
}
bool
RateLimiter::ResourceManager::AllocateResources(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk1(model_resources_mtx_);
  std::lock_guard<std::mutex> lk2(allocated_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return false;
  } else {
    // First pass to verify if resources are available
    {
      std::lock_guard<std::mutex> lk3(max_resources_mtx_);
      for (const auto& ditr : itr->second) {
        auto allocated_ditr = allocated_resources_.find(ditr.first);
        if (allocated_ditr == allocated_resources_.end()) {
          allocated_ditr =
              allocated_resources_
                  .emplace(ditr.first, std::map<std::string, size_t>())
                  .first;
        }
        for (const auto& ritr : ditr.second) {
          auto allocated_ritr = allocated_ditr->second.find(ritr.first);
          if (allocated_ritr == allocated_ditr->second.end()) {
            allocated_ritr =
                allocated_ditr->second.emplace(ritr.first, 0).first;
          }
          if ((allocated_ritr->second + ritr.second) >
              (max_resources_[ditr.first])[ritr.first]) {
            return false;
          }
        }
      }
    }
    // Second pass to actually allocate the resources
    for (const auto& ditr : itr->second) {
      for (const auto& ritr : ditr.second) {
        (allocated_resources_[ditr.first])[ritr.first] += ritr.second;
      }
    }
  }
  return true;
}
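// Note: AllocateResources() is all-or-nothing. The first pass only verifies
// that every resource demanded by the instance fits under max_resources_;
// the second pass commits the allocation. Holding allocated_resources_mtx_
// across both passes keeps the check-then-act sequence atomic.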
Status
RateLimiter::ResourceManager::ReleaseResources(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk1(model_resources_mtx_);
  std::lock_guard<std::mutex> lk2(allocated_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "Unable to find the instance resources to release");
  } else {
    for (const auto& ditr : itr->second) {
      for (const auto& ritr : ditr.second) {
        (allocated_resources_[ditr.first])[ritr.first] -= ritr.second;
      }
    }
  }
  return Status::Success;
}
RateLimiter::ResourceManager::ResourceManager(const ResourceMap& resource_map)
    : explicit_max_resources_(resource_map)
{
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/rate_limiter.h
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <vector>
#include "backend_model.h"
#include "backend_model_instance.h"
#include "instance_queue.h"
#include "model_config.pb.h"
#include "payload.h"
#include "status.h"
namespace triton { namespace core {

// Limits the rate at which requests are dispatched to the model instances.
class RateLimiter {
 public:
  using RateLimiterConfig = inference::ModelRateLimiter;
  using ResourceMap = std::map<int, std::map<std::string, size_t>>;

  enum RESOURCE_KIND_KEY {
    // Key for holding global resources
    GLOBAL_RESOURCE_KEY = -2,
    // Key for holding resources per each device
    PER_DEVICE_RESOURCE_KEY = -1
  };
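  // ResourceMap is keyed by device: GLOBAL_RESOURCE_KEY (-2) for resources
  // shared across devices, PER_DEVICE_RESOURCE_KEY (-1) for a count that
  // applies to each device separately, and a non-negative device id for a
  // single device. For example (hypothetical values), a map of
  //   { -2: {"R1": 10}, 0: {"R2": 4} }
  // exposes 10 units of global resource "R1" and 4 units of "R2" on
  // device 0.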
  /// Creates a rate limiter object which will funnel the requests to
  /// the model instances. A typical lifetime of a model instance within the
  /// RateLimiter transitions from available -> staged -> allocated ->
  /// available. The transition from available to staged occurs when a
  /// request is registered for the model. Depending upon the resource
  /// availability and priority, the RateLimiter will transition an instance
  /// to the allocated state at some point in the future. The staged state is
  /// skipped when configured to ignore the resource constraints. The cycle
  /// in this case will be available -> allocated -> available.
  /// \param ignore_resources_and_priority Whether or not to ignore resource
  /// constraints and cross-model priority. An available instance is directly
  /// allocated when true.
  /// \param resource_map The map to the available resource count provided
  /// explicitly.
  /// \return Status object indicating success or failure.
  static Status Create(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map,
      std::unique_ptr<RateLimiter>* rate_limiter);
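  // The instance state machine described above, in brief:
  //
  //   AVAILABLE --Stage()--> STAGED --Allocate()--> ALLOCATED
  //       ^                                             |
  //       +---------------- Release() ------------------+
  //
  // (DirectAllocate() goes AVAILABLE -> ALLOCATED when resources and
  // priority are ignored; REMOVED is entered during model unregistration.)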
  /// Registers the model instance with the rate limiter.
  /// \param instance The pointer to the TritonModelInstance object to
  /// register with the rate limiter.
  /// \param rate_limiter_config The rate limiter configuration associated
  /// with the model instance.
  /// \return Status object indicating success or failure.
  Status RegisterModelInstance(
      TritonModelInstance* instance,
      const RateLimiterConfig& rate_limiter_config);
  /// Remove model from the set of models being managed by the rate limiter.
  /// \param model The pointer to TritonModel object to be removed.
  /// \return Status object indicating success or failure.
  Status UnregisterModel(const TritonModel* model);
  /// Returns true if there is a payload slot available for the given model.
  /// \param model The pointer to the TritonModel object to check.
  /// \return slot availability in boolean.
  bool PayloadSlotAvailable(const TritonModel* model);
  /// Enqueues the payload to rate limiter for scheduling on the given model.
  /// \param model The pointer to the TritonModel object that will execute
  /// the payload.
  /// \param payload The shared pointer to the payload object.
  /// \return Status object indicating success or failure.
  Status EnqueuePayload(
      const TritonModel* model, std::shared_ptr<Payload> payload);
  /// Returns the payload that has been scheduled for the given set of model
  /// instances. Note that this call is blocking and depends upon the
  /// availability of payloads in the rate limiter for the triton model
  /// instances.
  /// \param instances The pointers to TritonModelInstance objects whose
  /// payload is being requested.
  /// \param payload The shared pointer to the payload object.
  void DequeuePayload(
      std::deque<TritonModelInstance*>& instances,
      std::shared_ptr<Payload>* payload);
  /// Returns a new payload object.
  /// \param op_type The operation type for the payload.
  /// \param instance Optional field that provides the model instance that
  /// must be used for the execution of the payload. Default is nullptr,
  /// which allows any model instance to execute the payload.
  /// \return The shared pointer to a new payload object.
  std::shared_ptr<Payload> GetPayload(
      const Payload::Operation op_type,
      TritonModelInstance* instance = nullptr);
  /// Releases the given payload object back to the rate limiter.
  /// \param payload The payload to release.
  void PayloadRelease(std::shared_ptr<Payload>& payload);
 private:
  class ModelInstanceContext;
  class ModelContext;
  struct PayloadQueue;

  using StandardReleaseFunc = std::function<void(ModelInstanceContext*)>;
  using StandardScheduleFunc = std::function<void(ModelInstanceContext*)>;
  using StandardStageFunc = std::function<void(ModelInstanceContext*)>;
  // Holds the state of the model instance.
  class ModelInstanceContext {
   public:
    friend class RateLimiter;
    friend class ResourceManager;

    enum State { AVAILABLE, STAGED, ALLOCATED, REMOVED };

    void Release();
    TritonModelInstance* RawInstance() const
    {
      return triton_model_instance_;
    }

   private:
    ModelInstanceContext(
        TritonModelInstance* triton_model_instance,
        ModelContext* model_context,
        const RateLimiterConfig& rate_limiter_config,
        StandardStageFunc OnStage, StandardReleaseFunc OnRelease);

    const RateLimiterConfig* GetRateLimiterConfig() const
    {
      return &rate_limiter_config_;
    }

    void MarkAvailable();
    double ScaledPriority();
    Status Stage(StandardScheduleFunc OnSchedule);
    Status Allocate();
    Status DirectAllocate(StandardScheduleFunc OnSchedule);
    void RequestRemoval();
    void WaitForRemoval();

    TritonModelInstance* triton_model_instance_;
    size_t index_;
    ModelContext* model_context_;
    RateLimiterConfig rate_limiter_config_;
    StandardStageFunc OnStage_;
    StandardReleaseFunc OnRelease_;
    std::atomic<uint64_t> exec_count_;
    State state_;
    bool removal_in_progress_;
    std::mutex state_mtx_;
    StandardScheduleFunc OnSchedule_;
    std::condition_variable cv_;
  };
  class ScaledPriorityComparator {
   public:
    bool operator()(ModelInstanceContext* a, ModelInstanceContext* b)
    {
      return a->ScaledPriority() > b->ScaledPriority();
    }
  };

  using PriorityQueue = std::priority_queue<
      ModelInstanceContext*, std::vector<ModelInstanceContext*>,
      ScaledPriorityComparator>;
  // Holds the active context of a model
  class ModelContext {
   public:
    ModelContext();
    Status EnqueueModelInstanceRequest(
        const StandardScheduleFunc& OnSchedule,
        TritonModelInstance* triton_model_instance);
    void AddAvailableInstance(ModelInstanceContext* instance);
    void StageInstanceIfAvailable(TritonModelInstance* triton_model_instance);
    void AllocateInstanceIfAvailable();
    void AddSpecificRequestQueue();
    bool ContainsPendingRequests(int32_t index);
    void RequestRemoval();
    bool isRemovalInProgress() { return removal_in_progress_; }

   private:
    bool removal_in_progress_;
    // Queues holding pending scheduling requests
    std::queue<StandardScheduleFunc> generic_sched_request_queue_;
    std::vector<std::queue<StandardScheduleFunc>>
        specific_sched_request_queues_;
    std::recursive_mutex sched_request_queue_mtx_;
    // The set of instances that are available at the moment
    PriorityQueue avbl_instances_;
    std::recursive_mutex avbl_instances_mtx_;
  };
  // Manages and keeps track of resource allocation to the model instances.
  class ResourceManager {
   public:
    static Status Create(
        const ResourceMap& resource_map,
        std::unique_ptr<ResourceManager>* resource_manager);
    void AddModelInstance(const ModelInstanceContext* instance);
    Status RemoveModelInstance(const ModelInstanceContext* instance);
    Status UpdateResourceLimits();
    bool AllocateResources(const ModelInstanceContext* instance);
    Status ReleaseResources(const ModelInstanceContext* instance);

   private:
    ResourceManager(const ResourceMap& resource_map);
    Status ValidateMaxResources();
    Status ParseAndValidateExplicitResources();

    ResourceMap explicit_max_resources_;

    std::map<const ModelInstanceContext*, ResourceMap> model_resources_;
    std::mutex model_resources_mtx_;

    ResourceMap max_resources_;
    std::mutex max_resources_mtx_;

    ResourceMap allocated_resources_;
    std::mutex allocated_resources_mtx_;
  };
  RateLimiter(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map);

  void InitializePayloadQueues(const TritonModelInstance* instance);
  Status DeferPayloadSchedule(
      const StandardScheduleFunc& OnSchedule, const TritonModel* model,
      TritonModelInstance* instance = nullptr);
  void OnStage(ModelInstanceContext* instance_ptr);
  void OnRelease(ModelInstanceContext* instance_ptr);
  void AttemptAllocation();
  void SchedulePayload(
      TritonModelInstance* tmi, PayloadQueue* payload_queue,
      const std::shared_ptr<Payload>& payload);
  bool ignore_resources_and_priority_;

  // Instance context for the models
  std::map<
      const TritonModel*, std::vector<std::shared_ptr<ModelInstanceContext>>>
      model_instance_ctxs_;
  std::mutex model_instance_ctx_mtx_;
  // Running context of the models
  std::map<const TritonModel*, ModelContext> model_contexts_;
  std::mutex model_ctx_mtx_;

  // Holds the model instances that have been staged
  PriorityQueue staged_instances_;
  std::recursive_mutex staged_instances_mtx_;

  // Manager to keep track of the resource allocations
  std::unique_ptr<ResourceManager> resource_manager_;

  // Mutex to serialize Payload [de]allocation
  std::mutex payload_mu_;
  // Mutex to serialize Payload Queues deallocation
  std::mutex payload_queues_mu_;
  // Keep some number of Payload objects for reuse to avoid the overhead
  // of creating a Payload for every new request.
  const size_t max_payload_bucket_count_;
  std::vector<std::shared_ptr<Payload>> payload_bucket_;
  std::deque<std::shared_ptr<Payload>> payloads_in_use_;
  struct PayloadQueue {
    explicit PayloadQueue(size_t max_batch_size, uint64_t max_queue_delay_ns)
    {
      queue_.reset(new InstanceQueue(max_batch_size, max_queue_delay_ns));
    }
    std::unique_ptr<InstanceQueue> queue_;
    std::map<const TritonModelInstance*, std::unique_ptr<InstanceQueue>>
        specific_queues_;
    std::mutex mu_;
    std::condition_variable cv_;
  };
  std::map<const TritonModel*, std::unique_ptr<PayloadQueue>> payload_queues_;
};
}}  // namespace triton::core
3rdparty/core-r22.12/src/repo_agent.cc
// Copyright 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "repo_agent.h"
#include <string>
#include "filesystem.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For unknown reasons, Windows will not export the TRITONREPOAGENT_*
// functions declared with dllexport in tritonrepoagent.h. To get
// those functions exported it is (also?) necessary to mark the
// definitions in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
std::string
TritonRepoAgentLibraryName(const std::string& agent_name)
{
#ifdef _WIN32
  return std::string("tritonrepoagent_") + agent_name + ".dll";
#else
  return std::string("libtritonrepoagent_") + agent_name + ".so";
#endif
}
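// For example, a hypothetical agent named "checksum" resolves to
// "libtritonrepoagent_checksum.so" on Linux and
// "tritonrepoagent_checksum.dll" on Windows.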
std::string
TRITONREPOAGENT_ActionTypeString(const TRITONREPOAGENT_ActionType type)
{
  switch (type) {
    case TRITONREPOAGENT_ACTION_LOAD:
      return "TRITONREPOAGENT_ACTION_LOAD";
    case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
      return "TRITONREPOAGENT_ACTION_LOAD_COMPLETE";
    case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      return "TRITONREPOAGENT_ACTION_LOAD_FAIL";
    case TRITONREPOAGENT_ACTION_UNLOAD:
      return "TRITONREPOAGENT_ACTION_UNLOAD";
    case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
      return "TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE";
  }
  return "Unknown TRITONREPOAGENT_ActionType";
}
std::string
TRITONREPOAGENT_ArtifactTypeString(const TRITONREPOAGENT_ArtifactType type)
{
  switch (type) {
    case TRITONREPOAGENT_ARTIFACT_FILESYSTEM:
      return "TRITONREPOAGENT_ARTIFACT_FILESYSTEM";
    case TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM:
      return "TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM";
  }
  return "Unknown TRITONREPOAGENT_ArtifactType";
}
//
// TritonRepoAgent
//
Status
TritonRepoAgent::Create(
    const std::string& name, const std::string& libpath,
    std::shared_ptr<TritonRepoAgent>* agent)
{
  std::shared_ptr<TritonRepoAgent> lagent(new TritonRepoAgent(name));
  {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath, &lagent->dlhandle_));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_Initialize", true /* optional */,
        reinterpret_cast<void**>(&lagent->init_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_Finalize", true /* optional */,
        reinterpret_cast<void**>(&lagent->fini_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelInitialize",
        true /* optional */,
        reinterpret_cast<void**>(&lagent->model_init_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelFinalize",
        true /* optional */,
        reinterpret_cast<void**>(&lagent->model_fini_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelAction",
        false /* optional */,
        reinterpret_cast<void**>(&lagent->model_action_fn_)));
  }
  // Initialize if needed
  if (lagent->init_fn_ != nullptr) {
    RETURN_IF_TRITONSERVER_ERROR(lagent->init_fn_(
        reinterpret_cast<TRITONREPOAGENT_Agent*>(lagent.get())));
  }
  *agent = std::move(lagent);
  return Status::Success;
}
TritonRepoAgent::~TritonRepoAgent()
{
  // Finalize if needed
  if (fini_fn_ != nullptr) {
    auto err = fini_fn_(reinterpret_cast<TRITONREPOAGENT_Agent*>(this));
    if (err != nullptr) {
      LOG_ERROR << "~TritonRepoAgent: "
                << Status(
                       TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
                       TRITONSERVER_ErrorMessage(err))
                       .AsString();
      TRITONSERVER_ErrorDelete(err);
    };
  }
  {
    std::unique_ptr<SharedLibrary> slib;
    LOG_STATUS_ERROR(SharedLibrary::Acquire(&slib), "~TritonRepoAgent");
    LOG_STATUS_ERROR(slib->CloseLibraryHandle(dlhandle_), "~TritonRepoAgent");
  }
}
//
// TritonRepoAgentModel
//
Status
TritonRepoAgentModel::Create(
    const TRITONREPOAGENT_ArtifactType type, const std::string& location,
    const inference::ModelConfig& config,
    const std::shared_ptr<TritonRepoAgent>& agent,
    const TritonRepoAgent::Parameters& agent_parameters,
    std::unique_ptr<TritonRepoAgentModel>* agent_model)
{
  std::unique_ptr<TritonRepoAgentModel> lagent_model(new TritonRepoAgentModel(
      type, location, config, agent, agent_parameters));
  if (agent->AgentModelInitFn() != nullptr) {
    RETURN_IF_TRITONSERVER_ERROR(agent->AgentModelInitFn()(
        reinterpret_cast<TRITONREPOAGENT_Agent*>(agent.get()),
        reinterpret_cast<TRITONREPOAGENT_AgentModel*>(lagent_model.get())));
  }
  *agent_model = std::move(lagent_model);
  return Status::Success;
}
TritonRepoAgentModel::~TritonRepoAgentModel()
{
  // Need to ensure the proper lifecycle is informed
  if (action_type_set_) {
    switch (current_action_type_) {
      case TRITONREPOAGENT_ACTION_LOAD:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_LOAD_FAIL),
            "Inform TRITONREPOAGENT_ACTION_LOAD_FAIL");
        break;
      case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD");
        // The [[fallthrough]] attribute is not a language feature until
        // C++17, so invoke the UNLOAD_COMPLETE action explicitly here
        // instead of falling through.
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE");
        break;
      case TRITONREPOAGENT_ACTION_UNLOAD:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE");
        break;
      case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
        break;
    }
  }
  if (agent_->AgentModelFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        agent_->AgentModelFiniFn()(
            reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
            reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this)),
        "~TritonRepoAgentModel");
  }
  if (!acquired_location_.empty()) {
    DeleteMutableLocation();
  }
}
Status
TritonRepoAgentModel::InvokeAgent(const TRITONREPOAGENT_ActionType action_type)
{
  if ((!action_type_set_) && (action_type != TRITONREPOAGENT_ACTION_LOAD)) {
    return Status(
        Status::Code::INTERNAL,
        "Unexpected lifecycle start state " +
            TRITONREPOAGENT_ActionTypeString(action_type));
  }
  switch (action_type) {
    case TRITONREPOAGENT_ACTION_LOAD:
      if (action_type_set_) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
    case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_UNLOAD:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD_COMPLETE) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_UNLOAD) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
  }
  current_action_type_ = action_type;
  action_type_set_ = true;
  RETURN_IF_TRITONSERVER_ERROR(agent_->AgentModelActionFn()(
      reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
      reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this), action_type));
  return Status::Success;
}
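// Note: InvokeAgent() enforces the legal action-type transitions:
//
//   LOAD -> LOAD_COMPLETE -> UNLOAD -> UNLOAD_COMPLETE
//   LOAD -> LOAD_FAIL
//
// Any other transition is reported as an INTERNAL error.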
Status
TritonRepoAgentModel::SetLocation(
    const TRITONREPOAGENT_ArtifactType type, const std::string& location)
{
  if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD) {
    return Status(
        Status::Code::INVALID_ARG,
        "location can only be updated during TRITONREPOAGENT_ACTION_LOAD, "
        "current action type is " +
            (action_type_set_
                 ? TRITONREPOAGENT_ActionTypeString(current_action_type_)
                 : "not set"));
  }
  type_ = type;
  location_ = location;
  return Status::Success;
}
Status
TritonRepoAgentModel::Location(
    TRITONREPOAGENT_ArtifactType* type, const char** location)
{
  if (location_.empty()) {
    return Status(
        Status::Code::INTERNAL, "Model repository location is not set");
  }
  *type = type_;
  *location = location_.c_str();
  return Status::Success;
}
Status
TritonRepoAgentModel::AcquireMutableLocation(
    const TRITONREPOAGENT_ArtifactType type, const char** location)
{
  if (type != TRITONREPOAGENT_ARTIFACT_FILESYSTEM) {
    return Status(
        Status::Code::INVALID_ARG,
        "Unexpected artifact type, expects "
        "'TRITONREPOAGENT_ARTIFACT_FILESYSTEM'");
  }
  if (acquired_location_.empty()) {
    std::string lacquired_location;
    RETURN_IF_ERROR(
        MakeTemporaryDirectory(FileSystemType::LOCAL, &lacquired_location));
    acquired_location_.swap(lacquired_location);
    acquired_type_ = type;
  }
  *location = acquired_location_.c_str();
  return Status::Success;
}
Status
TritonRepoAgentModel::DeleteMutableLocation()
{
  if (acquired_location_.empty()) {
    return Status(
        Status::Code::UNAVAILABLE, "No mutable location to be deleted");
  }
  auto status = DeletePath(acquired_location_);
  if (!status.IsOk()) {
    LOG_ERROR << "Failed to delete previously acquired location '"
              << acquired_location_ << "': " << status.AsString();
  }
  acquired_location_.clear();
  return Status::Success;
}
//
// TritonRepoAgentManager
//
TritonRepoAgentManager&
TritonRepoAgentManager::Singleton()
{
  static TritonRepoAgentManager triton_repo_agent_manager;
  return triton_repo_agent_manager;
}
Status
TritonRepoAgentManager::SetGlobalSearchPath(const std::string& path)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);
  singleton_manager.global_search_path_ = path;
  return Status::Success;
}
Status
TritonRepoAgentManager::CreateAgent(
    const std::string& agent_name, std::shared_ptr<TritonRepoAgent>* agent)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);

  // Get the path to the agent shared library. Search path is global
  // agent directory. FIXME expose global path as Triton option
  const std::vector<std::string> search_paths = {
      JoinPath({singleton_manager.global_search_path_, agent_name})};
  std::string agent_libname = TritonRepoAgentLibraryName(agent_name);
  std::string libpath;
  for (const auto& path : search_paths) {
    const auto full_path = JoinPath({path, agent_libname});
    bool exists = false;
    RETURN_IF_ERROR(FileExists(full_path, &exists));
    if (exists) {
      libpath = full_path;
      break;
    }
  }
  if (libpath.empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "unable to find '" + agent_libname + "' for repo agent '" +
            agent_name +
            "', searched: " + singleton_manager.global_search_path_);
  }

  const auto& itr = singleton_manager.agent_map_.find(libpath);
  if (itr != singleton_manager.agent_map_.end()) {
    // Found in map. If the weak_ptr is still valid that means that
    // there are other models using the agent and we just reuse that
    // same agent. If the weak_ptr is not valid then agent has been
    // unloaded so we need to remove the weak_ptr from the map and
    // create the agent again.
    *agent = itr->second.lock();
    if (*agent != nullptr) {
      return Status::Success;
    }
    singleton_manager.agent_map_.erase(itr);
  }
  RETURN_IF_ERROR(TritonRepoAgent::Create(agent_name, libpath, agent));
  singleton_manager.agent_map_.insert({libpath, *agent});
  return Status::Success;
}
Status
TritonRepoAgentManager::AgentState(
    std::unique_ptr<std::unordered_map<std::string, std::string>>* agent_state)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);

  std::unique_ptr<std::unordered_map<std::string, std::string>>
      agent_state_map(new std::unordered_map<std::string, std::string>);
  for (const auto& agent_pair : singleton_manager.agent_map_) {
    auto& libpath = agent_pair.first;
    auto agent = agent_pair.second.lock();
    if (agent != nullptr) {
      agent_state_map->insert({agent->Name(), libpath});
    }
  }
  *agent_state = std::move(agent_state_map);
  return Status::Success;
}
extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ApiVersion(uint32_t* major, uint32_t* minor)
{
  *major = TRITONREPOAGENT_API_VERSION_MAJOR;
  *minor = TRITONREPOAGENT_API_VERSION_MINOR;
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocation(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    TRITONREPOAGENT_ArtifactType* artifact_type, const char** location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(tam->Location(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationAcquire(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char** location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      tam->AcquireMutableLocation(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationRelease(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const char* location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(tam->DeleteMutableLocation());
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryUpdate(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char* location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      tam->SetLocation(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameterCount(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    uint32_t* count)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  *count = tam->AgentParameters().size();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameter(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t index, const char** parameter_name,
    const char** parameter_value)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  const auto& params = tam->AgentParameters();
  if (index >= params.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "index out of range for model parameters");
  }
  *parameter_name = params[index].first.c_str();
  *parameter_value = params[index].second.c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelConfig(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t config_version, TRITONSERVER_Message** model_config)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  std::string model_config_json;
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      ModelConfigToJson(tam->Config(), config_version, &model_config_json));
  return TRITONSERVER_MessageNewFromSerializedJson(
      model_config, model_config_json.c_str(), model_config_json.length());
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelState(TRITONREPOAGENT_AgentModel* model, void** state)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  *state = tam->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelSetState(TRITONREPOAGENT_AgentModel* model, void* state)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  tam->SetState(state);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_State(TRITONREPOAGENT_Agent* agent, void** state)
{
  TritonRepoAgent* ta = reinterpret_cast<TritonRepoAgent*>(agent);
  *state = ta->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_SetState(TRITONREPOAGENT_Agent* agent, void* state)
{
  TritonRepoAgent* ta = reinterpret_cast<TritonRepoAgent*>(agent);
  ta->SetState(state);
  return nullptr;  // success
}
}  // extern "C"

}}  // namespace triton::core
3rdparty/core-r22.12/src/repo_agent.h
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "tritonserver_apis.h"
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "constants.h"
#include "model_config_utils.h"
namespace triton { namespace core {
std::string TritonRepoAgentLibraryName(const std::string& agent_name);
std::string TRITONREPOAGENT_ActionTypeString(
    const TRITONREPOAGENT_ActionType type);
std::string TRITONREPOAGENT_ArtifactTypeString(
    const TRITONREPOAGENT_ArtifactType type);
class
TritonRepoAgent
{
public:
using
Parameters
=
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
;
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentInitFn_t
)(
TRITONREPOAGENT_Agent
*
agent
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentFiniFn_t
)(
TRITONREPOAGENT_Agent
*
agent
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelInitFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelFiniFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelActionFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
,
const
TRITONREPOAGENT_ActionType
action_type
);
static
Status
Create
(
const
std
::
string
&
name
,
const
std
::
string
&
libpath
,
std
::
shared_ptr
<
TritonRepoAgent
>*
agent
);
~
TritonRepoAgent
();
const
std
::
string
&
Name
()
{
return
name_
;
}
void
*
State
()
{
return
state_
;
}
void
SetState
(
void
*
state
)
{
state_
=
state
;
}
TritonRepoAgentModelActionFn_t
AgentModelActionFn
()
const
{
return
model_action_fn_
;
}
TritonRepoAgentModelInitFn_t
AgentModelInitFn
()
const
{
return
model_init_fn_
;
}
TritonRepoAgentModelFiniFn_t
AgentModelFiniFn
()
const
{
return
model_fini_fn_
;
}
protected:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgent
);
TritonRepoAgent
(
const
std
::
string
&
name
)
:
name_
(
name
),
state_
(
nullptr
),
dlhandle_
(
nullptr
),
init_fn_
(
nullptr
),
fini_fn_
(
nullptr
),
model_init_fn_
(
nullptr
),
model_fini_fn_
(
nullptr
),
model_action_fn_
(
nullptr
)
{
}
const
std
::
string
name_
;
void
*
state_
;
// dlopen / dlsym handles
void
*
dlhandle_
;
TritonRepoAgentInitFn_t
init_fn_
;
TritonRepoAgentFiniFn_t
fini_fn_
;
TritonRepoAgentModelInitFn_t
model_init_fn_
;
TritonRepoAgentModelFiniFn_t
model_fini_fn_
;
TritonRepoAgentModelActionFn_t
model_action_fn_
;
};
class
TritonRepoAgentModel
{
public:
static
Status
Create
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
,
const
inference
::
ModelConfig
&
config
,
const
std
::
shared_ptr
<
TritonRepoAgent
>&
agent
,
const
TritonRepoAgent
::
Parameters
&
agent_parameters
,
std
::
unique_ptr
<
TritonRepoAgentModel
>*
agent_model
);
~
TritonRepoAgentModel
();
void
*
State
()
{
return
state_
;
}
void
SetState
(
void
*
state
)
{
state_
=
state
;
}
Status
InvokeAgent
(
const
TRITONREPOAGENT_ActionType
action_type
);
const
TritonRepoAgent
::
Parameters
&
AgentParameters
()
{
return
agent_parameters_
;
}
Status
SetLocation
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
);
Status
Location
(
TRITONREPOAGENT_ArtifactType
*
type
,
const
char
**
location
);
Status
AcquireMutableLocation
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
char
**
location
);
Status
DeleteMutableLocation
();
const
inference
::
ModelConfig
Config
()
{
return
config_
;
}
private:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgentModel
);
TritonRepoAgentModel
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
,
const
inference
::
ModelConfig
&
config
,
const
std
::
shared_ptr
<
TritonRepoAgent
>&
agent
,
const
TritonRepoAgent
::
Parameters
&
agent_parameters
)
:
state_
(
nullptr
),
config_
(
config
),
agent_
(
agent
),
agent_parameters_
(
agent_parameters
),
type_
(
type
),
location_
(
location
),
action_type_set_
(
false
),
current_action_type_
(
TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE
)
{
}
void
*
state_
;
const
inference
::
ModelConfig
config_
;
const
std
::
shared_ptr
<
TritonRepoAgent
>
agent_
;
const
TritonRepoAgent
::
Parameters
agent_parameters_
;
TRITONREPOAGENT_ArtifactType
type_
;
std
::
string
location_
;
TRITONREPOAGENT_ArtifactType
acquired_type_
;
std
::
string
acquired_location_
;
bool
action_type_set_
;
TRITONREPOAGENT_ActionType
current_action_type_
;
};
class
TritonRepoAgentManager
{
public:
static
Status
SetGlobalSearchPath
(
const
std
::
string
&
path
);
static
Status
CreateAgent
(
const
std
::
string
&
agent_name
,
std
::
shared_ptr
<
TritonRepoAgent
>*
agent
);
static
Status
AgentState
(
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>*
agent_state
);
private:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgentManager
);
TritonRepoAgentManager
()
:
global_search_path_
(
"/opt/tritonserver/repoagents"
){};
static
TritonRepoAgentManager
&
Singleton
();
std
::
mutex
mu_
;
std
::
string
global_search_path_
;
std
::
unordered_map
<
std
::
string
,
std
::
weak_ptr
<
TritonRepoAgent
>>
agent_map_
;
};
}}
// namespace triton::core
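TritonRepoAgent::Create resolves the agent's entry points from a shared library into the dlhandle_ / *_fn_ members above. A self-contained sketch of that resolution pattern using POSIX dlfcn.h (the library path and symbol name below are placeholders, not the loader logic of this file):

#include <dlfcn.h>
#include <cstdio>

// Shape of TritonRepoAgent::TritonRepoAgentModelActionFn_t, reduced to
// opaque pointers for the sketch.
typedef int (*ModelActionFn)(void* agent, void* model, int action_type);

int main()
{
  // "libdemoagent.so" and "DemoAgent_ModelAction" are hypothetical names.
  void* handle = dlopen("libdemoagent.so", RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  auto fn = reinterpret_cast<ModelActionFn>(
      dlsym(handle, "DemoAgent_ModelAction"));
  if (fn == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
  } else {
    fn(nullptr, nullptr, 0);  // invoke the resolved entry point
  }
  dlclose(handle);
  return 0;
}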
3rdparty/core-r22.12/src/response_allocator.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "tritonserver_apis.h"
namespace triton { namespace core {

//
// Implementation for TRITONSERVER_ResponseAllocator.
//
class ResponseAllocator {
 public:
  explicit ResponseAllocator(
      TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
      TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
      TRITONSERVER_ResponseAllocatorStartFn_t start_fn)
      : alloc_fn_(alloc_fn), buffer_attributes_fn_(nullptr),
        query_fn_(nullptr), release_fn_(release_fn), start_fn_(start_fn)
  {
  }

  void SetQueryFunction(TRITONSERVER_ResponseAllocatorQueryFn_t query_fn)
  {
    query_fn_ = query_fn;
  }

  void SetBufferAttributesFunction(
      TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn)
  {
    buffer_attributes_fn_ = buffer_attributes_fn;
  }

  TRITONSERVER_ResponseAllocatorAllocFn_t AllocFn() const { return alloc_fn_; }
  TRITONSERVER_ResponseAllocatorBufferAttributesFn_t BufferAttributesFn() const
  {
    return buffer_attributes_fn_;
  }
  TRITONSERVER_ResponseAllocatorQueryFn_t QueryFn() const { return query_fn_; }
  TRITONSERVER_ResponseAllocatorReleaseFn_t ReleaseFn() const
  {
    return release_fn_;
  }
  TRITONSERVER_ResponseAllocatorStartFn_t StartFn() const { return start_fn_; }

 private:
  TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn_;
  TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn_;
  TRITONSERVER_ResponseAllocatorQueryFn_t query_fn_;
  TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn_;
  TRITONSERVER_ResponseAllocatorStartFn_t start_fn_;
};

}}  // namespace triton::core
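ResponseAllocator is a plain holder for user callbacks: the required ones are fixed at construction and the optional ones are attached later via setters, which is why QueryFn()/BufferAttributesFn() may legitimately return nullptr. A self-contained sketch of the same pattern with simplified function-pointer types (the Alloc/Release/Query signatures below are stand-ins, not the TRITONSERVER_* typedefs):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

typedef void* (*AllocFn)(std::size_t byte_size);  // stand-in signatures
typedef void (*ReleaseFn)(void* buffer);
typedef bool (*QueryFn)(std::size_t byte_size);

class CallbackHolder {
 public:
  CallbackHolder(AllocFn alloc_fn, ReleaseFn release_fn)
      : alloc_fn_(alloc_fn), release_fn_(release_fn), query_fn_(nullptr)
  {
  }
  void SetQueryFunction(QueryFn query_fn) { query_fn_ = query_fn; }
  AllocFn Alloc() const { return alloc_fn_; }
  ReleaseFn Release() const { return release_fn_; }
  QueryFn Query() const { return query_fn_; }  // may be nullptr

 private:
  AllocFn alloc_fn_;
  ReleaseFn release_fn_;
  QueryFn query_fn_;
};

int main()
{
  CallbackHolder holder(
      [](std::size_t n) { return std::malloc(n); },
      [](void* p) { std::free(p); });
  void* buf = holder.Alloc()(64);
  // Optional callbacks must be nullptr-checked before use.
  if (holder.Query() != nullptr && holder.Query()(64)) {
    std::puts("query says ok");
  }
  holder.Release()(buf);
  return 0;
}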
3rdparty/core-r22.12/src/response_cache.cc
0 → 100644
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "response_cache.h"
#include "infer_stats.h"
#include "triton/common/logging.h"
namespace {

enum class ScopedTimerType { INSERTION, LOOKUP };

class ScopedTimer {
 public:
  explicit ScopedTimer(
      triton::core::InferenceRequest& request, uint64_t& duration,
      ScopedTimerType type)
      : request_(request), duration_(duration), type_(type)
  {
    switch (type_) {
      case ScopedTimerType::LOOKUP:
        request_.CaptureCacheLookupStartNs();
        break;
      case ScopedTimerType::INSERTION:
        request_.CaptureCacheInsertionStartNs();
        break;
    }
  }

  ~ScopedTimer()
  {
    switch (type_) {
      case ScopedTimerType::LOOKUP:
        request_.CaptureCacheLookupEndNs();
        duration_ +=
            request_.CacheLookupEndNs() - request_.CacheLookupStartNs();
        break;
      case ScopedTimerType::INSERTION:
        request_.CaptureCacheInsertionEndNs();
        duration_ +=
            request_.CacheInsertionEndNs() - request_.CacheInsertionStartNs();
        break;
    }
  }

 private:
  triton::core::InferenceRequest& request_;
  uint64_t& duration_;
  ScopedTimerType type_;
};

std::string
PointerToString(void* ptr)
{
  std::stringstream ss;
  ss << ptr;
  return ss.str();
}

}  // namespace
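ScopedTimer uses RAII so the end timestamp is captured on every exit path of Lookup()/Insert(), including early error returns. A self-contained sketch of the same idea with std::chrono (DemoTimer is a hypothetical name):

#include <chrono>
#include <cstdint>
#include <cstdio>

class DemoTimer {
 public:
  explicit DemoTimer(uint64_t& duration_ns)
      : duration_ns_(duration_ns), start_(std::chrono::steady_clock::now())
  {
  }
  ~DemoTimer()
  {
    // Accumulates on destruction, so every return path is measured.
    duration_ns_ += std::chrono::duration_cast<std::chrono::nanoseconds>(
                        std::chrono::steady_clock::now() - start_)
                        .count();
  }

 private:
  uint64_t& duration_ns_;
  std::chrono::steady_clock::time_point start_;
};

int main()
{
  uint64_t total_ns = 0;
  {
    DemoTimer timer(total_ns);  // scope mirrors a Lookup()/Insert() body
  }
  std::printf("elapsed ~%llu ns\n", (unsigned long long)total_ns);
  return 0;
}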
namespace triton { namespace core {

Status
RequestResponseCache::Create(
    uint64_t cache_size, std::unique_ptr<RequestResponseCache>* cache)
{
  try {
    cache->reset(new RequestResponseCache(cache_size));
  }
  catch (const std::exception& ex) {
    return Status(
        Status::Code::INTERNAL,
        "Failed to initialize Response Cache: " + std::string(ex.what()));
  }

  return Status::Success;
}

RequestResponseCache::RequestResponseCache(const uint64_t size)
{
  // Allocate buffer
  buffer_ = malloc(size);
  // Exit early if buffer allocation failed
  if (buffer_ == nullptr) {
    throw std::runtime_error("failed to allocate buffer");
  }

  // Create cache as managed buffer
  managed_buffer_ = boost::interprocess::managed_external_buffer(
      boost::interprocess::create_only_t{}, buffer_, size);

  LOG_INFO << "Response Cache is created at '" << PointerToString(buffer_)
           << "' with size " << size;
}
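The constructor above layers a boost::interprocess::managed_external_buffer over a single malloc'd region, so every cache entry is carved out of one preallocated arena. A self-contained sketch of that allocation scheme, using only calls that also appear in this file (the sizes are arbitrary):

#include <boost/interprocess/managed_external_buffer.hpp>
#include <cstdio>
#include <cstdlib>
#include <new>

int main()
{
  const std::size_t arena_size = 1 << 16;  // 64 KiB arena, arbitrary
  void* arena = std::malloc(arena_size);
  if (arena == nullptr) return 1;

  boost::interprocess::managed_external_buffer pool(
      boost::interprocess::create_only_t{}, arena, arena_size);

  // The nothrow overload returns nullptr on exhaustion instead of throwing,
  // which is what lets the cache evict-and-retry in BuildCacheEntry().
  void* chunk = pool.allocate(1024, std::nothrow_t{});
  std::printf("free after allocate: %zu bytes\n", pool.get_free_memory());

  pool.deallocate(chunk);
  std::printf(
      "all deallocated: %s\n", pool.all_memory_deallocated() ? "yes" : "no");

  std::free(arena);
  return 0;
}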
RequestResponseCache::~RequestResponseCache()
{
  // Deallocate each chunk from managed buffer
  for (auto& iter : cache_) {
    auto& entry = iter.second;
    for (auto& output : entry.outputs_) {
      if (output.buffer_ != nullptr) {
        managed_buffer_.deallocate(output.buffer_);
      }
    }
  }

  // Validate we freed all underlying memory managed by cache
  if (!managed_buffer_.all_memory_deallocated()) {
    // Destructors can't throw exceptions
    LOG_ERROR << "failed to free managed cache memory";
  }

  // Free total cache buffer
  if (buffer_ != nullptr) {
    free(buffer_);
  }
}

Status
RequestResponseCache::Lookup(
    InferenceResponse* const response, InferenceRequest* const request)
{
  // Lock on cache lookup
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  if (request == nullptr) {
    return Status(
        Status::Code::INTERNAL, "Cache Lookup passed a nullptr request");
  }

  // Capture start latency now and end latency when timer goes out of scope
  ScopedTimer timer(
      *request, total_lookup_latency_ns_, ScopedTimerType::LOOKUP);

  // Hash the request and set cache key if it hasn't already been set
  if (!request->CacheKeyIsSet()) {
    RETURN_IF_ERROR(HashAndSet(request));
  }
  const uint64_t key = request->CacheKey();

  num_lookups_++;
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Looking up key [" + std::to_string(key) + "] in cache.";

  // Search cache for request hash key
  auto iter = cache_.find(key);
  if (iter == cache_.end()) {
    num_misses_++;
    LOG_VERBOSE(1) << request->LogRequest()
                   << "MISS for key [" + std::to_string(key) + "] in cache.";
    return Status(
        Status::Code::INTERNAL,
        request->LogRequest() + "key not found in cache");
  }
  // If find succeeds, it's a cache hit
  num_hits_++;
  LOG_VERBOSE(1) << request->LogRequest()
                 << "HIT for key [" + std::to_string(key) + "] in cache.";

  // Populate passed-in "response" from cache entry
  auto entry = iter->second;
  // Build InferenceResponse from CacheEntry
  RETURN_IF_ERROR(BuildInferenceResponse(entry, response));

  // Update this key to front of LRU list
  UpdateLRU(iter);
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Using cached response for key [" + std::to_string(key) +
                        "].";
  return Status::Success;
}

Status
RequestResponseCache::Insert(
    const InferenceResponse& response, InferenceRequest* const request)
{
  // Lock on cache insertion
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  if (request == nullptr) {
    return Status(
        Status::Code::INTERNAL, "Cache Insert passed a nullptr request");
  }

  // Capture start latency now and end latency when timer goes out of scope
  ScopedTimer timer(
      *request, total_insertion_latency_ns_, ScopedTimerType::INSERTION);

  // Hash the request and set cache key if it hasn't already been set
  if (!request->CacheKeyIsSet()) {
    RETURN_IF_ERROR(HashAndSet(request));
  }
  const uint64_t key = request->CacheKey();

  // Exit early if key already exists in cache
  auto iter = cache_.find(key);
  if (iter != cache_.end()) {
    return Status(
        Status::Code::ALREADY_EXISTS, request->LogRequest() + "key [" +
                                          std::to_string(key) +
                                          "] already exists in cache");
  }

  // Construct cache entry from response
  auto entry = CacheEntry();
  RETURN_IF_ERROR(BuildCacheEntry(response, &entry));

  // Insert entry into cache
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Inserting key [" + std::to_string(key) + "] into cache.";
  auto cache_pair = cache_.insert({key, entry});
  // Exit early if cache insertion failed
  if (!cache_pair.second) {
    LOG_ERROR << request->LogRequest() << "Failed to insert key into map.";
    return Status(
        Status::Code::INTERNAL,
        request->LogRequest() + "Cache insertion failed");
  }

  // Update LRU with new cache entry
  auto cache_iter = cache_pair.first;
  UpdateLRU(cache_iter);
  return Status::Success;
}

// LRU
Status
RequestResponseCache::Evict()
{
  // Lock on cache eviction
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  // Nothing to evict if cache is empty
  if (NumEntries() == 0) {
    return Status(
        Status::Code::INTERNAL, "Cache is empty, nothing to evict.");
  }

  // Least recently used key in back of LRU list
  uint64_t lru_key = lru_.back();
  LOG_VERBOSE(1) << "Evicting key [" + std::to_string(lru_key) +
                        "] from cache.";

  // Find cache entry for least recently used key
  auto iter = cache_.find(lru_key);
  // Error check if key isn't in cache, but this shouldn't happen in evict
  // and probably indicates a bug
  if (iter == cache_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "key [" + std::to_string(lru_key) +
            "] not found in cache during eviction: this indicates a bug in "
            "the code");
  }

  // Get size of cache entry being evicted to update available size
  auto entry = iter->second;
  // Free managed memory used in cache entry's outputs
  for (auto& output : entry.outputs_) {
    // Lock on buffer deallocation
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    managed_buffer_.deallocate(output.buffer_);
  }

  // Remove LRU entry from cache
  cache_.erase(lru_key);
  // Remove LRU key from LRU list
  lru_.pop_back();
  // Increment number of evictions
  num_evictions_++;
  return Status::Success;
}

// Helpers
void
RequestResponseCache::UpdateLRU(
    std::unordered_map<uint64_t, CacheEntry>::iterator& cache_iter)
{
  // Lock on cache update
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  const auto& key = cache_iter->first;
  auto& cache_entry = cache_iter->second;
  // Remove key from LRU list if it was already in there
  auto lru_iter = std::find(lru_.begin(), lru_.end(), key);
  if (lru_iter != lru_.end()) {
    lru_.erase(lru_iter);
  }
  // Add key to front of LRU list since it's most recently used
  lru_.push_front(key);
  // Set CacheEntry LRU iterator to new LRU key location
  cache_entry.lru_iter_ = lru_.begin();
}
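UpdateLRU keeps an std::list of keys ordered most- to least-recently used, with each map entry remembering its list iterator. A self-contained sketch of that map-plus-list bookkeeping (Touch is a hypothetical helper name):

#include <cstdint>
#include <cstdio>
#include <list>
#include <unordered_map>

struct Entry {
  std::list<uint64_t>::iterator lru_iter;  // position in the LRU list
};

std::unordered_map<uint64_t, Entry> cache;
std::list<uint64_t> lru;  // front = most recent, back = least recent

void Touch(uint64_t key)
{
  auto it = cache.find(key);
  if (it == cache.end()) {
    return;
  }
  // Erasing via the stored iterator is O(1); UpdateLRU() above uses a linear
  // std::find instead, which is equivalent in effect but O(n).
  lru.erase(it->second.lru_iter);
  lru.push_front(key);
  it->second.lru_iter = lru.begin();
}

int main()
{
  for (uint64_t k : {1, 2, 3}) {
    lru.push_front(k);
    cache[k] = Entry{lru.begin()};
  }
  Touch(1);  // key 1 becomes most recent
  // The back of the list is the eviction candidate, as in Evict() above.
  std::printf("evict candidate: %llu\n", (unsigned long long)lru.back());
  return 0;
}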
Status
RequestResponseCache::BuildCacheEntry(
    const InferenceResponse& response, CacheEntry* const entry)
{
  // Build cache entry data from response outputs
  for (const auto& response_output : response.Outputs()) {
    auto cache_output = Output();

    // Fetch output buffer details
    const void* response_buffer = nullptr;
    size_t response_byte_size = 0;
    TRITONSERVER_MemoryType response_memory_type;
    int64_t response_memory_type_id;
    void* userp;
    RETURN_IF_ERROR(response_output.DataBuffer(
        &response_buffer, &response_byte_size, &response_memory_type,
        &response_memory_type_id, &userp));

    // TODO: Handle other memory types
    if (response_memory_type != TRITONSERVER_MEMORY_CPU &&
        response_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
      return Status(
          Status::Code::INTERNAL,
          "Only input buffers in CPU memory are allowed in cache currently");
    }

    // Exit early if response buffer from output is invalid
    if (response_buffer == nullptr) {
      return Status(
          Status::Code::INTERNAL, "Response buffer from output was nullptr");
    }

    // Lock on managed buffer references
    {
      std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);

      // Exit early if cache entry will be larger than available cache size
      if (response_byte_size > managed_buffer_.get_size()) {
        return Status(
            Status::Code::INTERNAL,
            "Cache entry is larger than total cache size");
      }

      // If cache doesn't have enough space, evict until enough space
      // available.
      // NOTE: FreeBytes() doesn't account for allocator overhead so
      // allocation may fail even if response_byte_size is less than
      // FreeBytes()
      while (response_byte_size > FreeBytes()) {
        LOG_VERBOSE(1) << "EVICT: Response larger than remaining available "
                          "memory, attempting to evict from cache.";
        RETURN_IF_ERROR(Evict());
      }

      // Attempt to allocate buffer until success or eviction from cache fails
      while (cache_output.buffer_ == nullptr) {
        // Allocate buffer for response output in cache entry
        cache_output.buffer_ =
            managed_buffer_.allocate(response_byte_size, std::nothrow_t{});
        // Attempt to evict if allocation fails
        if (cache_output.buffer_ == nullptr) {
          LOG_VERBOSE(1) << "FAILED to allocate buffer in cache. Attempting "
                            "to evict an entry.";
          // Exit out if Eviction fails
          RETURN_IF_ERROR(Evict());
        }
      }

      // Copy data from response buffer to cache entry output buffer
      // TODO: Handle other memory types
      std::memcpy(cache_output.buffer_, response_buffer, response_byte_size);

      // Set output metadata
      cache_output.name_ = response_output.Name();
      cache_output.dtype_ = response_output.DType();
      cache_output.shape_ = response_output.Shape();
      cache_output.buffer_size_ = static_cast<uint64_t>(response_byte_size);
    }

    // Add each output to cache entry
    entry->outputs_.push_back(cache_output);
  }

  return Status::Success;
}

Status
RequestResponseCache::BuildInferenceResponse(
    const CacheEntry& entry, InferenceResponse* const response)
{
  if (response == nullptr) {
    return Status(Status::Code::INTERNAL, "invalid response ptr passed in");
  }

  // Lock on cache references
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

    // Inference response outputs should be empty so we can append to them
    if (response->Outputs().size() != 0) {
      return Status(
          Status::Code::INTERNAL,
          "InferenceResponse already contains some outputs");
    }

    for (auto& cache_output : entry.outputs_) {
      InferenceResponse::Output* response_output = nullptr;
      RETURN_IF_ERROR(response->AddOutput(
          cache_output.name_, cache_output.dtype_, cache_output.shape_,
          &response_output));

      if (response_output == nullptr) {
        return Status(
            Status::Code::INTERNAL,
            "InferenceResponse::Output pointer as nullptr");
      }

      TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
      int64_t memory_type_id = 0;

      // Allocate buffer for inference response
      void* buffer;
      RETURN_IF_ERROR(response_output->AllocateDataBuffer(
          &buffer, cache_output.buffer_size_, &memory_type, &memory_type_id));

      // TODO: Handle other memory types
      if (memory_type != TRITONSERVER_MEMORY_CPU &&
          memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
        return Status(
            Status::Code::INTERNAL,
            "Only input buffers in CPU memory are allowed in cache currently");
      }

      if (buffer == nullptr) {
        return Status(
            Status::Code::INTERNAL,
            "failed to allocate buffer for output '" + cache_output.name_ +
                "'");
      }
      // Copy cached output buffer to allocated response output buffer
      std::memcpy(buffer, cache_output.buffer_, cache_output.buffer_size_);

      // TODO: Add field to InferenceResponse to indicate this was from cache
      // response.cached = true;
    }
  }

  return Status::Success;
}
Status
RequestResponseCache::HashInputBuffers(
    const InferenceRequest::Input* input, size_t* seed)
{
  // Iterate over each data buffer in input in case of non-contiguous memory
  for (size_t idx = 0; idx < input->DataBufferCount(); ++idx) {
    const void* src_buffer;
    size_t src_byte_size;
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;

    RETURN_IF_ERROR(input->DataBuffer(
        idx, &src_buffer, &src_byte_size, &src_memory_type,
        &src_memory_type_id));

    // TODO: Handle other memory types
    if (src_memory_type != TRITONSERVER_MEMORY_CPU &&
        src_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
      return Status(
          Status::Code::INTERNAL,
          "Only input buffers in CPU memory are allowed in cache currently");
    }

    // Add each byte of input buffer chunk to hash
    const unsigned char* tmp = static_cast<const unsigned char*>(src_buffer);
    for (uint64_t byte = 0; byte < src_byte_size; byte++) {
      boost::hash_combine(*seed, tmp[byte]);
    }
  }

  return Status::Success;
}

Status
RequestResponseCache::HashInputs(const InferenceRequest& request, size_t* seed)
{
  const auto& inputs = request.ImmutableInputs();
  // Convert inputs to ordered map for consistency in hashing
  // inputs sorted by key (input) name
  std::map<std::string, InferenceRequest::Input*> ordered_inputs(
      inputs.begin(), inputs.end());
  for (const auto& input : ordered_inputs) {
    // Add input name to hash
    boost::hash_combine(*seed, input.second->Name());
    // Fetch input buffer for hashing raw data
    RETURN_IF_ERROR(HashInputBuffers(input.second, seed));
  }

  return Status::Success;
}

Status
RequestResponseCache::Hash(const InferenceRequest& request, uint64_t* key)
{
  std::size_t seed = 0;
  // Add request model name to hash
  boost::hash_combine(seed, request.ModelName());
  // Add request model version to hash
  boost::hash_combine(seed, request.ActualModelVersion());
  RETURN_IF_ERROR(HashInputs(request, &seed));
  *key = static_cast<uint64_t>(seed);
  return Status::Success;
}

Status
RequestResponseCache::HashAndSet(InferenceRequest* const request)
{
  uint64_t key = 0;
  RETURN_IF_ERROR(Hash(*request, &key));
  request->SetCacheKey(key);
  return Status::Success;
}

}}  // namespace triton::core
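Hash() composes the cache key with boost::hash_combine over the model name, model version, each input name (in sorted order), and every raw input byte. A self-contained sketch of the same composition for a single hypothetical input (the model name, version, and bytes below are made up):

#include <boost/functional/hash.hpp>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::size_t seed = 0;
  boost::hash_combine(seed, std::string("resnet50"));  // model name (made up)
  boost::hash_combine(seed, int64_t{1});               // model version

  // One input tensor: name first, then each raw byte, mirroring the
  // HashInputs() -> HashInputBuffers() chain above.
  boost::hash_combine(seed, std::string("INPUT0"));
  std::vector<unsigned char> raw = {0x01, 0x02, 0x03, 0x04};
  for (unsigned char byte : raw) {
    boost::hash_combine(seed, byte);
  }

  std::printf("cache key = %llu\n", (unsigned long long)(uint64_t)seed);
  return 0;
}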
3rdparty/core-r22.12/src/response_cache.h
0 → 100644
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include "infer_request.h"
#include "infer_response.h"
#include "model.h"
#include "status.h"
#include <boost/functional/hash.hpp>
#include <boost/interprocess/managed_external_buffer.hpp>
namespace triton { namespace core {

// Assuming CPU memory only for now
struct Output {
  // Output tensor data buffer
  void* buffer_;
  // Size of "buffer" above
  uint64_t buffer_size_ = 0;
  // Name of the output
  std::string name_;
  // Datatype of the output
  inference::DataType dtype_;
  // Shape of the output
  std::vector<int64_t> shape_;
};

struct CacheEntry {
  explicit CacheEntry() {}
  // Point to key in LRU list for maintaining LRU order
  std::list<uint64_t>::iterator lru_iter_;
  // each output buffer = managed_buffer.allocate(size, ...)
  std::vector<Output> outputs_;
};

class RequestResponseCache {
 public:
  ~RequestResponseCache();

  // Create the request/response cache object
  static Status Create(
      uint64_t cache_size, std::unique_ptr<RequestResponseCache>* cache);

  // Hash inference request for cache access and store it in "request" object.
  // This will also be called internally in Lookup/Insert if the request
  // hasn't already stored its hash. It is up to the user to update the hash
  // in the request if modifying any hashed fields of the request object
  // after storing.
  // Return Status object indicating success or failure.
  Status HashAndSet(InferenceRequest* const request);

  // Lookup 'request' hash in cache and return the inference response in
  // 'response' on cache hit or nullptr on cache miss
  // Return Status object indicating success or failure.
  Status Lookup(
      InferenceResponse* const response, InferenceRequest* const request);

  // Insert response into cache, evict entries to make space if necessary
  // Return Status object indicating success or failure.
  Status Insert(
      const InferenceResponse& response, InferenceRequest* const request);

  // Evict entry from cache based on policy
  // Return Status object indicating success or failure.
  Status Evict();

  // Returns number of items in cache
  size_t NumEntries()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return cache_.size();
  }
  // Returns number of items evicted in cache lifespan
  size_t NumEvictions()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_evictions_;
  }
  // Returns number of lookups in cache lifespan, should sum to hits + misses
  size_t NumLookups()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_lookups_;
  }
  // Returns number of cache hits in cache lifespan
  size_t NumHits()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_hits_;
  }
  // Returns number of cache misses in cache lifespan
  size_t NumMisses()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_misses_;
  }
  // Returns the total lookup latency (nanoseconds) of all lookups in cache
  // lifespan
  uint64_t TotalLookupLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_lookup_latency_ns_;
  }
  uint64_t TotalInsertionLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_insertion_latency_ns_;
  }
  // Returns total number of bytes allocated for cache
  size_t TotalBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size();
  }
  // Returns number of free bytes in cache
  size_t FreeBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_free_memory();
  }
  // Returns number of bytes in use by cache
  size_t AllocatedBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size() - managed_buffer_.get_free_memory();
  }
  // Returns fraction of bytes allocated over total cache size between [0, 1]
  double TotalUtilization()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return static_cast<double>(AllocatedBytes()) /
           static_cast<double>(TotalBytes());
  }

 private:
  explicit RequestResponseCache(const uint64_t cache_size);
  // Update LRU ordering on lookup
  void UpdateLRU(std::unordered_map<uint64_t, CacheEntry>::iterator&);
  // Build CacheEntry from InferenceResponse
  Status BuildCacheEntry(
      const InferenceResponse& response, CacheEntry* const entry);
  // Build InferenceResponse from CacheEntry
  Status BuildInferenceResponse(
      const CacheEntry& entry, InferenceResponse* const response);
  // Helper function to hash data buffers used by "input"
  Status HashInputBuffers(const InferenceRequest::Input* input, size_t* seed);
  // Helper function to hash each input in "request"
  Status HashInputs(const InferenceRequest& request, size_t* seed);
  // Helper function to hash request and store it in "key"
  Status Hash(const InferenceRequest& request, uint64_t* key);

  // Cache buffer
  void* buffer_;
  // Managed buffer
  boost::interprocess::managed_external_buffer managed_buffer_;
  // key -> CacheEntry containing values and list iterator for LRU management
  std::unordered_map<uint64_t, CacheEntry> cache_;
  // List of keys sorted from most to least recently used
  std::list<uint64_t> lru_;
  // Cache metrics
  size_t num_evictions_ = 0;
  size_t num_lookups_ = 0;
  size_t num_hits_ = 0;
  size_t num_misses_ = 0;
  uint64_t total_lookup_latency_ns_ = 0;
  uint64_t total_insertion_latency_ns_ = 0;
  // Mutex for buffer synchronization
  std::recursive_mutex buffer_mtx_;
  // Mutex for cache synchronization
  std::recursive_mutex cache_mtx_;
};

}}  // namespace triton::core
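Note that the accessors above lock a std::recursive_mutex: TotalUtilization() takes buffer_mtx_ and then calls AllocatedBytes(), which takes it again on the same thread. A self-contained sketch of why a plain std::mutex would self-deadlock here (the Stats class and its numbers are hypothetical):

#include <cstddef>
#include <cstdio>
#include <mutex>

class Stats {
 public:
  std::size_t Allocated()
  {
    std::lock_guard<std::recursive_mutex> lk(mtx_);  // second acquisition
    return used_;
  }
  double Utilization()
  {
    std::lock_guard<std::recursive_mutex> lk(mtx_);  // first acquisition
    // With std::mutex this nested call would deadlock on itself;
    // recursive_mutex lets the same thread re-enter.
    return static_cast<double>(Allocated()) / static_cast<double>(total_);
  }

 private:
  std::recursive_mutex mtx_;
  std::size_t used_ = 256;
  std::size_t total_ = 1024;
};

int main()
{
  Stats stats;
  std::printf("utilization = %.2f\n", stats.Utilization());
  return 0;
}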
3rdparty/core-r22.12/src/scheduler.h
0 → 100644
// Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include "infer_request.h"
#include "status.h"
namespace triton { namespace core {

// Scheduler interface.
class Scheduler {
 public:
  virtual ~Scheduler() {}

  // The prototype for the initialization function that will be called
  // by the "standard" schedulers created based on a model's
  // scheduling_choice settings. The init function is called once by
  // the runner that will later execute requests for 'runner_idx'. A
  // non-OK error status indicates an initialization error that
  // prevents scheduler from using the runner.
  using StandardInitFunc = std::function<Status(uint32_t runner_idx)>;

  // The prototype for the warmup function that will be called by the
  // "standard" schedulers created based on a model's
  // scheduling_choice settings. The warmup function is called once by
  // the runner that will later execute requests for 'runner_idx'. A
  // non-OK error status indicates an error that prevents scheduler
  // from sending warmup requests to the runner.
  using StandardWarmupFunc = std::function<Status(uint32_t runner_idx)>;

  // The prototype for the run function that will be called by the
  // "standard" schedulers created based on a model's
  // scheduling_choice settings. The run function must accept a
  // 'runner_idx' indicating which runner should execute the
  // 'requests'. Ownership of the 'requests' is transferred to the
  // runner which is responsible for generating responses and
  // releasing the requests.
  using StandardRunFunc = std::function<void(
      uint32_t runner_idx,
      std::vector<std::unique_ptr<InferenceRequest>>&& requests)>;

  // Enqueue a request with the scheduler. If Status::Success is returned
  // then the backend has taken ownership of the request object and so
  // 'request' will be nullptr. If non-success is returned then the
  // caller still retains ownership of 'request'.
  virtual Status Enqueue(std::unique_ptr<InferenceRequest>& request) = 0;

  // Return the number of in-flight inferences tracked by the scheduler.
  virtual size_t InflightInferenceCount() = 0;

  // Instruct the scheduler to stop processing future requests unless they
  // are considered as in-flight.
  virtual void Stop() = 0;
};

}}  // namespace triton::core
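The Enqueue() contract above encodes ownership transfer in the unique_ptr itself: on success the callee moves from the reference, leaving the caller's pointer null; on failure the caller keeps the request. A self-contained sketch of that convention with placeholder types (DemoRequest and DemoScheduler are hypothetical):

#include <cstdio>
#include <memory>
#include <vector>

struct DemoRequest {
  int id;
};

class DemoScheduler {
 public:
  // Returns true on success, in which case 'request' is left null.
  bool Enqueue(std::unique_ptr<DemoRequest>& request)
  {
    if (queue_.size() >= 2) {
      return false;  // caller keeps ownership on failure
    }
    queue_.push_back(std::move(request));  // nulls the caller's pointer
    return true;
  }

 private:
  std::vector<std::unique_ptr<DemoRequest>> queue_;
};

int main()
{
  DemoScheduler scheduler;
  auto req = std::make_unique<DemoRequest>(DemoRequest{42});
  if (scheduler.Enqueue(req)) {
    std::printf(
        "enqueued; caller pointer is %s\n",
        req == nullptr ? "null" : "non-null");
  }
  return 0;
}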
3rdparty/core-r22.12/src/scheduler_utils.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "scheduler_utils.h"
#include <cassert>
#include "constants.h"
#include "triton/common/logging.h"
namespace triton { namespace core {

Status
RequiredEqualInputs::Initialize(
    const std::unique_ptr<InferenceRequest>& request,
    const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
    const bool has_optional_input)
{
  has_optional_input_ = has_optional_input;
  required_inputs_.clear();
  for (const auto& pr : request->ImmutableInputs()) {
    const InferenceRequest::Input* input = pr.second;
    const auto itr = enforce_equal_shape_tensors.find(input->Name());
    if (itr != enforce_equal_shape_tensors.end()) {
      required_inputs_.emplace(
          std::piecewise_construct, std::forward_as_tuple(input->Name()),
          std::forward_as_tuple(input, itr->second));
    }
    // When the model has optional inputs, overload 'required_inputs_'
    // to track the inputs involved in the batch
    else if (has_optional_input) {
      required_inputs_.emplace(
          std::piecewise_construct, std::forward_as_tuple(input->Name()),
          std::forward_as_tuple(nullptr, false));
    }
  }

  init_ = true;
  return Status::Success;
}

bool
RequiredEqualInputs::HasEqualInputs(
    const std::unique_ptr<InferenceRequest>& request)
{
  // If the current request has a different number of inputs, then dynamic
  // batching shouldn't be applied.
  if (has_optional_input_ &&
      (request->ImmutableInputs().size() != required_inputs_.size())) {
    return false;
  }
  for (const auto& pr : request->ImmutableInputs()) {
    const InferenceRequest::Input* input = pr.second;
    const auto itr = required_inputs_.find(input->Name());
    if (itr != required_inputs_.end()) {
      if (itr->second.first != nullptr) {
        // Make sure shape of input tensors is equal.
        if (!triton::common::CompareDims(
                itr->second.first->Shape(), input->Shape())) {
          return false;
        }

        // If necessary compare the contents as well...
        if (itr->second.second) {
          const auto& d1 = itr->second.first->Data();
          const auto& d2 = input->Data();

          // For now being conservative and assuming that content
          // comparison is for shape tensors which are likely to always
          // be in a single buffer.
          if ((d1->BufferCount() != 1) || (d2->BufferCount() != 1)) {
            return false;
          }

          size_t d1_byte_size, d2_byte_size;
          TRITONSERVER_MemoryType d1_memory_type, d2_memory_type;
          int64_t d1_memory_id, d2_memory_id;
          const char* d1_buffer = d1->BufferAt(
              0 /* idx */, &d1_byte_size, &d1_memory_type, &d1_memory_id);
          const char* d2_buffer = d2->BufferAt(
              0 /* idx */, &d2_byte_size, &d2_memory_type, &d2_memory_id);

          // Tensors must be the same size and in CPU memory so that they
          // can be easily compared. If not, return false conservatively.
          if ((d1_byte_size != d2_byte_size) || (d1_buffer == nullptr) ||
              (d2_buffer == nullptr) ||
              (d1_memory_type == TRITONSERVER_MEMORY_GPU) ||
              (d2_memory_type == TRITONSERVER_MEMORY_GPU)) {
            return false;
          }

          if (strncmp(d1_buffer, d2_buffer, d1_byte_size) != 0) {
            return false;
          }
        }
      }
    }
    else if (has_optional_input_) {
      // If the model has optional inputs, the current request must contain
      // all inputs that are in the first request (tracked in
      // 'required_inputs_').
      return false;
    }
  }

  return true;
}
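HasEqualInputs() gates dynamic batching on two checks: equal dims, and, for shape tensors, byte-identical contents of a single CPU buffer. A self-contained sketch of those two comparisons (the CompareDims below is a local stand-in for triton::common::CompareDims, and std::memcmp plays the role of the strncmp over raw bytes above):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Local stand-in for triton::common::CompareDims.
static bool CompareDims(
    const std::vector<int64_t>& a, const std::vector<int64_t>& b)
{
  return a == b;  // element-wise equality, same rank required
}

int main()
{
  std::vector<int64_t> shape1 = {1, 4};
  std::vector<int64_t> shape2 = {1, 4};

  // Contents of two hypothetical shape tensors, each in one CPU buffer.
  const char buf1[] = {2, 0, 3, 0};
  const char buf2[] = {2, 0, 3, 0};

  bool batchable = CompareDims(shape1, shape2) &&
                   (sizeof(buf1) == sizeof(buf2)) &&
                   (std::memcmp(buf1, buf2, sizeof(buf1)) == 0);
  std::printf("can batch together: %s\n", batchable ? "yes" : "no");
  return 0;
}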
Status
PriorityQueue::PolicyQueue::Enqueue(std::unique_ptr<InferenceRequest>& request)
{
  if ((max_queue_size_ != 0) && (Size() >= max_queue_size_)) {
    return Status(
        Status::Code::UNAVAILABLE,
        request->LogRequest() + "Exceeds maximum queue size");
  }

  queue_.emplace_back(std::move(request));
  auto timeout_us = default_timeout_us_;
  if (allow_timeout_override_) {
    auto override_timeout_us = queue_.back()->TimeoutMicroseconds();
    if (override_timeout_us != 0 && override_timeout_us < timeout_us) {
      timeout_us = override_timeout_us;
    }
  }
  if (timeout_us != 0) {
    timeout_timestamp_ns_.emplace_back(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
            .count() +
        timeout_us * 1000);
  }
  else {
    timeout_timestamp_ns_.emplace_back(0);
  }

  return Status::Success;
}
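The enqueue path above converts a microsecond timeout into an absolute steady_clock deadline in nanoseconds, with 0 meaning "no timeout". A self-contained sketch of that deadline arithmetic (DeadlineNs is a hypothetical helper name):

#include <chrono>
#include <cstdint>
#include <cstdio>

// Returns an absolute deadline in ns since the steady_clock epoch,
// or 0 when timeout_us is 0 (meaning "never expires").
static uint64_t DeadlineNs(uint64_t timeout_us)
{
  if (timeout_us == 0) {
    return 0;
  }
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
             .count() +
         timeout_us * 1000;  // us -> ns
}

int main()
{
  uint64_t deadline = DeadlineNs(500);  // 500 us from now
  uint64_t now = std::chrono::duration_cast<std::chrono::nanoseconds>(
                     std::chrono::steady_clock::now().time_since_epoch())
                     .count();
  // Mirrors the expiry test in ApplyPolicy(): non-zero and in the past.
  std::printf(
      "expired: %s\n", (deadline != 0 && now > deadline) ? "yes" : "no");
  return 0;
}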
Status
PriorityQueue::PolicyQueue::Dequeue(std::unique_ptr<InferenceRequest>* request)
{
  if (!queue_.empty()) {
    *request = std::move(queue_.front());
    queue_.pop_front();
    timeout_timestamp_ns_.pop_front();
  }
  else {
    *request = std::move(delayed_queue_.front());
    delayed_queue_.pop_front();
  }
  return Status::Success;
}

bool
PriorityQueue::PolicyQueue::ApplyPolicy(
    size_t idx, size_t* rejected_count, size_t* rejected_batch_size)
{
  uint64_t now_nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count();
  if (idx < queue_.size()) {
    size_t curr_idx = idx;
    while (curr_idx < queue_.size()) {
      if ((timeout_timestamp_ns_[curr_idx] != 0) &&
          (now_nanoseconds > timeout_timestamp_ns_[curr_idx])) {
        if (timeout_action_ == inference::ModelQueuePolicy::DELAY) {
          delayed_queue_.emplace_back(std::move(queue_[curr_idx]));
        }
        else {
          rejected_queue_.emplace_back(std::move(queue_[curr_idx]));
          *rejected_count += 1;
          *rejected_batch_size +=
              std::max(1U, rejected_queue_.back()->BatchSize());
        }
        curr_idx++;
      }
      else {
        break;
      }
    }

    // Use range erasure on the deque as all erasure functions are linear;
    // this implies that, in the edge case where this function is always
    // called on a 'bad' index, the cost can be O(n^2). However, for data
    // structures with O(1) erasure, the traversal may not be as efficient
    // due to cache misses (elements not stored contiguously).
    queue_.erase(queue_.begin() + idx, queue_.begin() + curr_idx);
    timeout_timestamp_ns_.erase(
        timeout_timestamp_ns_.begin() + idx,
        timeout_timestamp_ns_.begin() + curr_idx);

    // Current idx is pointing to an item with unexpired timeout
    if (idx < queue_.size()) {
      return true;
    }
  }
  // At this point, idx is pointing to an item with expired timeout.
  // If the item is in the delayed queue, then return true. Otherwise, false,
  // meaning the queue has no item with this 'idx'.
  return ((idx - queue_.size()) < delayed_queue_.size());
}

void
PriorityQueue::PolicyQueue::ReleaseRejectedQueue(
    std::deque<std::unique_ptr<InferenceRequest>>* requests)
{
  rejected_queue_.swap(*requests);
}

const std::unique_ptr<InferenceRequest>&
PriorityQueue::PolicyQueue::At(size_t idx) const
{
  if (idx < queue_.size()) {
    return queue_[idx];
  }
  else {
    return delayed_queue_[idx - queue_.size()];
  }
}

uint64_t
PriorityQueue::PolicyQueue::TimeoutAt(size_t idx)
{
  if (idx < queue_.size()) {
    return timeout_timestamp_ns_[idx];
  }
  else {
    return 0;
  }
}
PriorityQueue::PriorityQueue()
    : size_(0), front_priority_level_(0), last_priority_level_(0)
{
  inference::ModelQueuePolicy default_policy;
  queues_.emplace(0, PolicyQueue(default_policy));
  front_priority_level_ = queues_.begin()->first;
  ResetCursor();
}

PriorityQueue::PriorityQueue(
    const inference::ModelQueuePolicy& default_queue_policy,
    uint32_t priority_levels, const ModelQueuePolicyMap queue_policy_map)
    : size_(0), last_priority_level_(priority_levels)
{
  if (priority_levels == 0) {
    queues_.emplace(0, PolicyQueue(default_queue_policy));
  }
  else {
    for (uint32_t level = 1; level <= priority_levels; level++) {
      auto it = queue_policy_map.find(level);
      if (it == queue_policy_map.end()) {
        queues_.emplace(level, PolicyQueue(default_queue_policy));
      }
      else {
        queues_.emplace(level, PolicyQueue(it->second));
      }
    }
  }
  front_priority_level_ = queues_.begin()->first;
  ResetCursor();
}

Status
PriorityQueue::Enqueue(
    uint32_t priority_level, std::unique_ptr<InferenceRequest>& request)
{
  auto status = queues_[priority_level].Enqueue(request);
  if (status.IsOk()) {
    size_++;
    front_priority_level_ = std::min(front_priority_level_, priority_level);
    // Invalidate the pending batch cursor if the enqueued item is placed
    // within the pending batch. At the same priority level the request is
    // guaranteed to be after the pending batch if the batch hasn't reached
    // the delayed queue.
    if ((priority_level < pending_cursor_.curr_it_->first) ||
        ((priority_level == pending_cursor_.curr_it_->first) &&
         (pending_cursor_.at_delayed_queue_))) {
      pending_cursor_.valid_ = false;
    }
  }
  return status;
}

Status
PriorityQueue::Dequeue(std::unique_ptr<InferenceRequest>* request)
{
  pending_cursor_.valid_ = false;
  while (true) {
    if (!queues_[front_priority_level_].Empty()) {
      RETURN_IF_ERROR(queues_[front_priority_level_].Dequeue(request));
      size_--;
      return Status::Success;
    }
    else if (front_priority_level_ != last_priority_level_) {
      front_priority_level_++;
      continue;
    }

    // Control reaches here if the queue for the last priority level is also
    // empty; then return the error below.
    break;
  }

  return Status(
      Status::Code::UNAVAILABLE,
      (*request)->LogRequest() + "dequeue on empty queue");
}

void
PriorityQueue::ReleaseRejectedRequests(
    std::shared_ptr<
        std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>* requests)
{
  auto res = std::make_shared<
      std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>(
      queues_.size());
  size_t idx = 0;
  for (auto& queue : queues_) {
    queue.second.ReleaseRejectedQueue(&((*res)[idx]));
    idx++;
  }

  requests->swap(res);
}

bool
PriorityQueue::IsCursorValid()
{
  if (pending_cursor_.valid_) {
    return (uint64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
               .count() <
           pending_cursor_.pending_batch_closest_timeout_ns_;
  }
  return false;
}

PriorityQueue::Cursor::Cursor(PriorityQueues::iterator start_it)
    : curr_it_(start_it), queue_idx_(0), at_delayed_queue_(false),
      pending_batch_closest_timeout_ns_(0),
      pending_batch_oldest_enqueue_time_ns_(0), pending_batch_count_(0),
      valid_(true)
{
}

size_t
PriorityQueue::ApplyPolicyAtCursor()
{
  size_t rejected_batch_size = 0;
  size_t rejected_count = 0;
  while (pending_cursor_.curr_it_ != queues_.end()) {
    if (!(pending_cursor_.curr_it_->second.ApplyPolicy(
            pending_cursor_.queue_idx_, &rejected_count,
            &rejected_batch_size))) {
      if (size_ > pending_cursor_.pending_batch_count_ + rejected_count) {
        pending_cursor_.curr_it_++;
        pending_cursor_.queue_idx_ = 0;
        continue;
      }
    }
    // Control reaches here if the cursor points to a request that is a
    // candidate for the pending batch, or if all requests are in the pending
    // batch.
    break;
  }
  size_ -= rejected_count;
  return rejected_batch_size;
}

void
PriorityQueue::AdvanceCursor()
{
  if (pending_cursor_.pending_batch_count_ >= size_) {
    return;
  }

  const auto& timeout_ns =
      pending_cursor_.curr_it_->second.TimeoutAt(pending_cursor_.queue_idx_);
  if (timeout_ns != 0) {
    if (pending_cursor_.pending_batch_closest_timeout_ns_ != 0) {
      pending_cursor_.pending_batch_closest_timeout_ns_ = std::min(
          pending_cursor_.pending_batch_closest_timeout_ns_, timeout_ns);
    }
    else {
      pending_cursor_.pending_batch_closest_timeout_ns_ = timeout_ns;
    }
  }

  uint64_t curr_enqueue_time_ns =
      pending_cursor_.curr_it_->second.At(pending_cursor_.queue_idx_)
          ->BatcherStartNs();
  if (pending_cursor_.pending_batch_oldest_enqueue_time_ns_ != 0) {
    pending_cursor_.pending_batch_oldest_enqueue_time_ns_ = std::min(
        pending_cursor_.pending_batch_oldest_enqueue_time_ns_,
        curr_enqueue_time_ns);
  }
  else {
    pending_cursor_.pending_batch_oldest_enqueue_time_ns_ =
        curr_enqueue_time_ns;
  }
  ++pending_cursor_.queue_idx_;
  ++pending_cursor_.pending_batch_count_;
  // pending batch includes delayed request if (queue_idx_ - 1) points to
  // delayed queue.
  pending_cursor_.at_delayed_queue_ =
      (pending_cursor_.queue_idx_ >
       pending_cursor_.curr_it_->second.UnexpiredSize());
}

}}  // namespace triton::core
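The PriorityQueue above is an std::map of priority level -> PolicyQueue, dequeued by walking levels from the smallest key and caching front_priority_level_ to skip the scan on the hot path. A self-contained sketch of that level-walk (DemoDequeue and the queue contents are hypothetical):

#include <cstdint>
#include <cstdio>
#include <deque>
#include <map>

std::map<uint32_t, std::deque<int>> queues = {
    {1, {}}, {2, {7, 8}}, {3, {9}}};
uint32_t front_level = 1;  // cached lowest level that may be non-empty

// Pops the next item in priority order (lower level = higher priority).
static bool DemoDequeue(int* out)
{
  const uint32_t last_level = queues.rbegin()->first;
  while (true) {
    if (!queues[front_level].empty()) {
      *out = queues[front_level].front();
      queues[front_level].pop_front();
      return true;
    }
    if (front_level == last_level) {
      return false;  // every level drained
    }
    front_level++;  // advance past the empty level, like Dequeue() above
  }
}

int main()
{
  int item;
  while (DemoDequeue(&item)) {
    std::printf("dequeued %d\n", item);  // prints 7, 8, 9 in that order
  }
  return 0;
}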
3rdparty/core-r22.12/src/scheduler_utils.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <deque>
#include <unordered_map>
#include "scheduler.h"
namespace triton { namespace core {

struct RequiredEqualInputs {
 public:
  RequiredEqualInputs() : init_(false), has_optional_input_(false) {}

  Status Initialize(
      const std::unique_ptr<InferenceRequest>& request,
      const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
      const bool has_optional_input);

  bool HasEqualInputs(const std::unique_ptr<InferenceRequest>& request);

  bool Initialized() { return init_; };

 private:
  bool init_;
  bool has_optional_input_;
  // A collection of inputs in the request; a nullptr for
  // InferenceRequest::Input indicates that the input doesn't require an
  // equality check
  std::unordered_map<
      std::string,
      std::pair<const InferenceRequest::Input*, bool /* compare contents */>>
      required_inputs_;
};

//
// PriorityQueue
//
using ModelQueuePolicyMap = ::google::protobuf::Map<
    ::google::protobuf::uint32, inference::ModelQueuePolicy>;

class PriorityQueue {
 public:
  // Construct a queue with no priority level with default queue policy,
  // which will behave the same as a regular queue.
  PriorityQueue();

  // Construct a queue with 'priority_levels'; the priority starts from 1.
  // Different priority levels may follow different queue policies given by
  // 'queue_policy_map'; otherwise, the 'default_queue_policy' will be used.
  PriorityQueue(
      const inference::ModelQueuePolicy& default_queue_policy,
      uint32_t priority_levels, const ModelQueuePolicyMap queue_policy_map);

  // Enqueue a request with priority set to 'priority_level'. If
  // Status::Success is returned then the queue has taken ownership of
  // the request object and so 'request' will be nullptr. If
  // non-success is returned then the caller still retains ownership
  // of 'request'.
  Status Enqueue(
      uint32_t priority_level, std::unique_ptr<InferenceRequest>& request);

  // Dequeue the request at the front of the queue.
  Status Dequeue(std::unique_ptr<InferenceRequest>* request);

  // Retrieve the requests that are rejected based on the queue policies.
  void ReleaseRejectedRequests(
      std::shared_ptr<
          std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>*
          requests);

  // Return the number of requests in the queue; rejected requests are
  // not included.
  size_t Size() { return size_; }

  // Is the queue empty? Rejected requests are not included.
  bool Empty() { return Size() == 0; }

  // Reset the cursor such that it is representing an empty pending batch.
  void ResetCursor() { pending_cursor_ = Cursor(queues_.begin()); }

  // Record the current cursor. The cursor can be restored to the recorded
  // state by invoking SetCursorToMark(). Note that Enqueue(), Dequeue(), and
  // ResetCursor() will invalidate the marker; it is the function caller's
  // responsibility to ensure the marker is valid before calling
  // SetCursorToMark().
  void MarkCursor() { current_mark_ = pending_cursor_; }

  // Apply the queue policy and alter the underlying queue accordingly. After
  // the function returns, the cursor may be at its end to indicate that
  // there is no request after the pending batch.
  // Returns the total batch size of the newly rejected requests.
  size_t ApplyPolicyAtCursor();

  // Return the request at the cursor.
  const std::unique_ptr<InferenceRequest>& RequestAtCursor()
  {
    return pending_cursor_.curr_it_->second.At(pending_cursor_.queue_idx_);
  }

  // Advance the cursor for the pending batch. This function will not trigger
  // the queue policy. No effect if the cursor has already reached the end of
  // the queue.
  void AdvanceCursor();

  // Whether the cursor reaches its end.
  bool CursorEnd() { return pending_cursor_.pending_batch_count_ == size_; }

  // Restore the cursor state to the marker.
  void SetCursorToMark() { pending_cursor_ = current_mark_; }

  // Whether the cursor is still valid. The cursor is valid only if the
  // pending batch is unchanged.
  bool IsCursorValid();

  // Return the oldest queued time of requests in the pending batch.
  uint64_t OldestEnqueueTime()
  {
    return pending_cursor_.pending_batch_oldest_enqueue_time_ns_;
  }

  // Return the closest timeout of requests in the pending batch.
  uint64_t ClosestTimeout()
  {
    return pending_cursor_.pending_batch_closest_timeout_ns_;
  }

  // Return the number of requests in the pending batch.
  size_t PendingBatchCount() { return pending_cursor_.pending_batch_count_; }

 private:
  class PolicyQueue {
   public:
    // Construct a policy queue with default policy, which will behave the
    // same as a regular queue.
    PolicyQueue()
        : timeout_action_(inference::ModelQueuePolicy::REJECT),
          default_timeout_us_(0), allow_timeout_override_(false),
          max_queue_size_(0)
    {
    }

    // Construct a policy queue with given 'policy'.
    PolicyQueue(const inference::ModelQueuePolicy& policy)
        : timeout_action_(policy.timeout_action()),
          default_timeout_us_(policy.default_timeout_microseconds()),
          allow_timeout_override_(policy.allow_timeout_override()),
          max_queue_size_(policy.max_queue_size())
    {
    }

    // Enqueue a request and set up its timeout accordingly. If
    // Status::Success is returned then the queue has taken ownership
    // of the request object and so 'request' will be nullptr. If
    // non-success is returned then the caller still retains ownership
    // of 'request'.
    Status Enqueue(std::unique_ptr<InferenceRequest>& request);

    // Dequeue the request at the front of the queue.
    Status Dequeue(std::unique_ptr<InferenceRequest>* request);

    // Apply the queue policy to the request at 'idx'.
    // 'rejected_count' will be incremented by the number of the newly
    // rejected requests after applying the policy.
    // 'rejected_batch_size' will be incremented by the total batch size of
    // the newly rejected requests after applying the policy.
    // Return true if the 'idx' still points to a request after applying the
    // policy, false otherwise.
    bool ApplyPolicy(
        size_t idx, size_t* rejected_count, size_t* rejected_batch_size);

    // Return the rejected requests held by the queue.
    void ReleaseRejectedQueue(
        std::deque<std::unique_ptr<InferenceRequest>>* requests);

    // Return the request at 'idx'.
    const std::unique_ptr<InferenceRequest>& At(size_t idx) const;

    // Return the timeout timestamp of the request at 'idx', in ns. A value
    // of 0 indicates that the request doesn't specify a timeout.
    uint64_t TimeoutAt(size_t idx);

    // Return whether the queue is empty; rejected requests are not included.
    bool Empty() { return Size() == 0; }

    // Return the number of requests in the queue; rejected requests are not
    // included.
    size_t Size() { return queue_.size() + delayed_queue_.size(); }

    // Return the number of unexpired requests in the queue
    size_t UnexpiredSize() { return queue_.size(); }

   private:
    // Variables that define the policy for the queue
    const inference::ModelQueuePolicy::TimeoutAction timeout_action_;
    const uint64_t default_timeout_us_;
    const bool allow_timeout_override_;
    const uint32_t max_queue_size_;

    std::deque<uint64_t> timeout_timestamp_ns_;
    std::deque<std::unique_ptr<InferenceRequest>> queue_;
    std::deque<std::unique_ptr<InferenceRequest>> delayed_queue_;
    std::deque<std::unique_ptr<InferenceRequest>> rejected_queue_;
  };
  using PriorityQueues = std::map<uint32_t, PolicyQueue>;

  // Cursor for tracking the pending batch; the cursor points to the item
  // after the pending batch.
  struct Cursor {
    Cursor() = default;
    Cursor(PriorityQueues::iterator start_it);
    Cursor(const Cursor& rhs) = default;
    Cursor& operator=(const Cursor& rhs) = default;

    PriorityQueues::iterator curr_it_;
    size_t queue_idx_;
    bool at_delayed_queue_;
    uint64_t pending_batch_closest_timeout_ns_;
    uint64_t pending_batch_oldest_enqueue_time_ns_;
    size_t pending_batch_count_;
    bool valid_;
  };

  PriorityQueues queues_;
  size_t size_;

  // Keep track of the priority level that the first request in the queue
  // is at to avoid traversing 'queues_'
  uint32_t front_priority_level_;
  uint32_t last_priority_level_;

  Cursor pending_cursor_;
  Cursor current_mark_;
};

}}  // namespace triton::core
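The cursor API above (MarkCursor/SetCursorToMark) lets a batcher speculatively extend a pending batch and roll back if the extension is unacceptable. A self-contained sketch of that mark/restore idea over a plain index cursor (DemoCursor and the acceptance test are hypothetical):

#include <cstddef>
#include <cstdio>
#include <vector>

struct DemoCursor {
  std::size_t idx = 0;  // points just past the pending batch
};

int main()
{
  std::vector<int> queue = {10, 20, 30, 40};
  DemoCursor cursor;

  cursor.idx = 2;            // pending batch currently covers {10, 20}
  DemoCursor mark = cursor;  // MarkCursor(): cheap copy of cursor state

  cursor.idx = 4;            // speculatively advance over {30, 40}
  bool accept_larger_batch = false;  // e.g. it would exceed max batch size
  if (!accept_larger_batch) {
    cursor = mark;           // SetCursorToMark(): roll back the speculation
  }
  std::printf("pending batch count = %zu\n", cursor.idx);
  return 0;
}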