ModelZoo / YOLOv7-tiny_triton / Commits

Commit c68e1835, authored Sep 18, 2023 by lijian6

Initial commit

Pipeline #561 failed with stages in 0 seconds
Changes: 184 | Pipelines: 1

Showing 20 changed files with 4990 additions and 0 deletions (+4990, -0)
src/c++/perf_analyzer/infer_data_manager.cc           +206   -0
src/c++/perf_analyzer/infer_data_manager.h             +92   -0
src/c++/perf_analyzer/infer_data_manager_base.cc      +183   -0
src/c++/perf_analyzer/infer_data_manager_base.h       +144   -0
src/c++/perf_analyzer/infer_data_manager_factory.h     +78   -0
src/c++/perf_analyzer/infer_data_manager_shm.cc       +381   -0
src/c++/perf_analyzer/infer_data_manager_shm.h        +159   -0
src/c++/perf_analyzer/inference_profiler.cc          +1761   -0
src/c++/perf_analyzer/inference_profiler.h            +734   -0
src/c++/perf_analyzer/ischeduler.h                     +42   -0
src/c++/perf_analyzer/iworker.h                        +38   -0
src/c++/perf_analyzer/load_manager.cc                 +284   -0
src/c++/perf_analyzer/load_manager.h                  +180   -0
src/c++/perf_analyzer/load_worker.cc                  +125   -0
src/c++/perf_analyzer/load_worker.h                   +155   -0
src/c++/perf_analyzer/main.cc                          +47   -0
src/c++/perf_analyzer/metrics.h                        +44   -0
src/c++/perf_analyzer/metrics_manager.cc              +174   -0
src/c++/perf_analyzer/metrics_manager.h                +94   -0
src/c++/perf_analyzer/mock_concurrency_worker.h        +69   -0
src/c++/perf_analyzer/infer_data_manager.cc (new file, 0 → 100644)
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager.h"
#include <algorithm>
namespace
triton
{
namespace
perfanalyzer
{
cb
::
Error
InferDataManager
::
Init
()
{
RETURN_IF_ERROR
(
CreateAndPopulateInputs
());
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManager
::
CreateAndPopulateInputs
()
{
// All combinations of thread + input + stream + step
//
for
(
size_t
thread_id
=
0
;
thread_id
<
max_threads_
;
thread_id
++
)
{
for
(
const
auto
&
input
:
*
(
parser_
->
Inputs
()))
{
const
std
::
string
&
name
=
input
.
first
;
const
ModelTensor
&
tensor
=
input
.
second
;
for
(
int
stream_id
=
0
;
stream_id
<
(
int
)
data_loader_
->
GetDataStreamsCount
();
stream_id
++
)
{
for
(
int
step_id
=
0
;
step_id
<
(
int
)
data_loader_
->
GetTotalSteps
(
stream_id
);
step_id
+=
1
)
{
RETURN_IF_ERROR
(
CreateAndPopulateInput
(
thread_id
,
name
,
tensor
,
stream_id
,
step_id
));
}
}
}
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManager
::
CreateAndPopulateInput
(
const
size_t
thread_id
,
const
std
::
string
&
name
,
const
ModelTensor
&
tensor
,
int
stream_id
,
int
step_id
)
{
std
::
vector
<
TensorData
>
input_datas
;
size_t
count
=
0
;
RETURN_IF_ERROR
(
GetInputData
(
name
,
tensor
,
stream_id
,
step_id
,
input_datas
));
if
(
tensor
.
is_shape_tensor_
)
{
RETURN_IF_ERROR
(
ValidateShapeTensor
(
tensor
,
stream_id
,
step_id
,
input_datas
));
}
std
::
vector
<
int64_t
>
shape
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputShape
(
tensor
,
stream_id
,
step_id
,
&
shape
));
if
(
!
shape
.
empty
())
{
if
((
parser_
->
MaxBatchSize
()
!=
0
)
&&
(
!
tensor
.
is_shape_tensor_
))
{
shape
.
insert
(
shape
.
begin
(),
(
int64_t
)
batch_size_
);
}
}
cb
::
InferInput
*
input
;
RETURN_IF_ERROR
(
CreateInferInput
(
&
input
,
backend_kind_
,
name
,
shape
,
tensor
.
datatype_
));
// Number of missing pieces of data for optional inputs
int
missing_data_cnt
=
0
;
int
total_cnt
=
input_datas
.
size
();
for
(
size_t
i
=
0
;
i
<
total_cnt
;
i
++
)
{
if
(
!
input_datas
[
i
].
is_valid
)
{
missing_data_cnt
++
;
}
else
{
RETURN_IF_ERROR
(
input
->
AppendRaw
(
input_datas
[
i
].
data_ptr
,
input_datas
[
i
].
batch1_size
));
}
}
// If all optional inputs had data provided, this is a valid input. But if
// some inferences in the batch provided data for an optional input and
// some inferences did not, this is an invalid case and an error is
// thrown.
if
(
missing_data_cnt
==
0
)
{
inputs_
.
insert
({{
thread_id
,
name
,
stream_id
,
step_id
},
input
});
}
else
if
(
missing_data_cnt
>
0
&&
missing_data_cnt
<
total_cnt
)
{
return
cb
::
Error
(
"For batch sizes larger than 1, the same set of inputs must be "
"specified for each batch. You cannot use different set of "
"optional inputs for each individual batch."
);
}
return
cb
::
Error
::
Success
;
}
cb
::
InferInput
*
InferDataManager
::
GetInput
(
const
size_t
thread_id
,
const
std
::
string
&
name
,
int
stream_id
,
int
step_id
)
{
auto
input
=
inputs_
.
find
({
thread_id
,
name
,
stream_id
,
step_id
});
if
(
input
==
inputs_
.
end
())
{
return
nullptr
;
}
else
{
return
input
->
second
;
}
}
cb
::
Error
InferDataManager
::
InitInferDataInput
(
const
std
::
string
&
name
,
const
ModelTensor
&
model_tensor
,
InferData
&
infer_data
)
{
std
::
vector
<
int64_t
>
shape
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputShape
(
model_tensor
,
0
,
0
,
&
shape
));
if
(
shape
.
empty
()
&&
(
backend_kind_
==
cb
::
BackendKind
::
TRITON
))
{
return
cb
::
Error
(
"unable to set shape for the input"
,
pa
::
GENERIC_ERROR
);
}
if
((
parser_
->
MaxBatchSize
()
!=
0
)
&&
(
!
model_tensor
.
is_shape_tensor_
))
{
shape
.
insert
(
shape
.
begin
(),
(
int64_t
)
batch_size_
);
}
cb
::
InferInput
*
infer_input
;
RETURN_IF_ERROR
(
CreateInferInput
(
&
infer_input
,
backend_kind_
,
name
,
shape
,
model_tensor
.
datatype_
));
infer_data
.
inputs_
.
push_back
(
infer_input
);
TensorData
input_data
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputData
(
model_tensor
,
0
,
0
,
input_data
));
// Add optional input to request if data was found
if
(
input_data
.
is_valid
)
{
infer_data
.
valid_inputs_
.
push_back
(
infer_input
);
}
if
(
!
shape
.
empty
())
{
size_t
max_count
=
(
parser_
->
MaxBatchSize
()
==
0
)
?
1
:
batch_size_
;
for
(
size_t
i
=
0
;
i
<
max_count
;
++
i
)
{
RETURN_IF_ERROR
(
infer_input
->
AppendRaw
(
input_data
.
data_ptr
,
input_data
.
batch1_size
));
}
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManager
::
InitInferDataOutput
(
const
std
::
string
&
name
,
InferData
&
infer_data
)
{
cb
::
InferRequestedOutput
*
requested_output
;
RETURN_IF_ERROR
(
cb
::
InferRequestedOutput
::
Create
(
&
requested_output
,
backend_kind_
,
name
));
infer_data
.
outputs_
.
push_back
(
requested_output
);
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManager
::
UpdateInputs
(
const
size_t
thread_id
,
const
int
stream_index
,
const
int
step_index
,
InferData
&
infer_data
)
{
// Reset inputs for this inference request
infer_data
.
valid_inputs_
.
clear
();
for
(
const
auto
&
input
:
infer_data
.
inputs_
)
{
const
auto
&
name
=
input
->
Name
();
cb
::
InferInput
*
tmp_input
=
GetInput
(
thread_id
,
name
,
stream_index
,
step_index
);
if
(
tmp_input
!=
nullptr
)
{
infer_data
.
valid_inputs_
.
push_back
(
tmp_input
);
}
}
return
cb
::
Error
::
Success
;
}
}}
// namespace triton::perfanalyzer
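
CreateAndPopulateInput above applies an all-or-nothing rule to optional inputs: every element of a batch must either provide data for an optional input or omit it, and a mixed batch is rejected. The following is a minimal standalone sketch of that counting rule, using a hypothetical SlotData stand-in; it is illustrative only and not part of this commit.

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for one batch element's optional-input slot.
struct SlotData {
  bool is_valid;  // true if this batch element provided data for the input
};

// Returns a non-empty error message when the batch mixes present and missing
// data, mirroring the check made at the end of CreateAndPopulateInput.
std::string CheckOptionalInputBatch(const std::vector<SlotData>& slots)
{
  int missing = 0;
  for (const auto& s : slots) {
    if (!s.is_valid) {
      missing++;
    }
  }
  if (missing == 0 || missing == static_cast<int>(slots.size())) {
    return "";  // all present or all missing: a valid batch
  }
  return "optional inputs must be specified for every batch element or none";
}

int main()
{
  assert(CheckOptionalInputBatch({{true}, {true}}).empty());    // valid
  assert(!CheckOptionalInputBatch({{true}, {false}}).empty());  // rejected
  return 0;
}
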
src/c++/perf_analyzer/infer_data_manager.h (new file, 0 → 100644)
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"

namespace triton { namespace perfanalyzer {

/// Manages infer data to prepare an inference request and the resulting
/// inference output from triton server
class InferDataManager : public InferDataManagerBase {
 public:
  InferDataManager(
      const size_t max_threads, const int32_t batch_size,
      const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
      : max_threads_(max_threads),
        InferDataManagerBase(batch_size, parser, factory, data_loader)
  {
  }

  /// Initialize this object. Must be called before any other functions
  /// \return cb::Error object indicating success or failure.
  cb::Error Init() override;

 protected:
  const size_t max_threads_{1};

  std::map<std::tuple<size_t, std::string, int, int>, cb::InferInput*> inputs_;

  cb::Error CreateAndPopulateInputs();
  cb::Error CreateAndPopulateInput(
      const size_t thread_id, const std::string& name,
      const ModelTensor& model_tensor, int stream_id, int step_id);
  cb::InferInput* GetInput(
      const size_t thread_id, const std::string& name, int stream_id,
      int step_id);

  cb::Error InitInferDataInput(
      const std::string& name, const ModelTensor& model_tensor,
      InferData& infer_data) override;

  cb::Error InitInferDataOutput(
      const std::string& name, InferData& infer_data) override;

  /// Helper function to update the inputs
  /// \param thread_id The ID of the calling thread
  /// \param stream_index The data stream to use for next data
  /// \param step_index The step index to use for next data
  /// \param infer_data The target InferData object
  /// \return cb::Error object indicating success or failure.
  cb::Error UpdateInputs(
      const size_t thread_id, const int stream_index, const int step_index,
      InferData& infer_data);

#ifndef DOCTEST_CONFIG_DISABLE
 public:
  InferDataManager() = default;
#endif
};

}}  // namespace triton::perfanalyzer
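
InferDataManager caches one cb::InferInput per (thread, input name, stream, step) combination in inputs_, a std::map keyed by a std::tuple, and GetInput reports a miss as nullptr. Below is a small self-contained sketch of that tuple-keyed lookup pattern; it stores plain ints instead of InferInput pointers, purely for illustration.

#include <cassert>
#include <cstddef>
#include <map>
#include <string>
#include <tuple>

int main()
{
  // Same key shape as InferDataManager::inputs_:
  // (thread id, input name, stream id, step id).
  std::map<std::tuple<std::size_t, std::string, int, int>, int> inputs;

  inputs.insert({{0, "INPUT0", 1, 2}, 42});

  // A populated combination is found directly by its tuple key.
  auto it = inputs.find({0, "INPUT0", 1, 2});
  assert(it != inputs.end() && it->second == 42);

  // A combination that was never populated simply misses, which is what
  // GetInput reports back to the caller as nullptr.
  assert(inputs.find({1, "INPUT0", 1, 2}) == inputs.end());
  return 0;
}
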
src/c++/perf_analyzer/infer_data_manager_base.cc (new file, 0 → 100644)
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_base.h"
#include <algorithm>
namespace
triton
{
namespace
perfanalyzer
{
cb
::
Error
InferDataManagerBase
::
GetInputData
(
const
std
::
string
&
name
,
const
ModelTensor
&
tensor
,
int
stream_id
,
int
step_id
,
std
::
vector
<
TensorData
>&
input_datas
)
{
size_t
max_count
=
tensor
.
is_shape_tensor_
?
1
:
batch_size_
;
std
::
vector
<
int64_t
>
shape
;
std
::
vector
<
int64_t
>
prev_shape
;
for
(
size_t
count
=
0
;
count
<
max_count
;
count
++
)
{
int
local_step_id
=
(
step_id
+
count
)
%
data_loader_
->
GetTotalSteps
(
stream_id
);
TensorData
input_data
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputShape
(
tensor
,
stream_id
,
local_step_id
,
&
shape
));
if
(
!
shape
.
empty
())
{
if
(
count
==
0
)
{
prev_shape
=
shape
;
}
else
{
if
(
!
std
::
equal
(
shape
.
begin
(),
shape
.
end
(),
prev_shape
.
begin
()))
{
return
cb
::
Error
(
"can not batch tensors with different shapes together "
"(input '"
+
name
+
"' expected shape "
+
ShapeVecToString
(
prev_shape
)
+
" and received "
+
ShapeVecToString
(
shape
),
pa
::
GENERIC_ERROR
);
}
}
}
RETURN_IF_ERROR
(
data_loader_
->
GetInputData
(
tensor
,
stream_id
,
local_step_id
,
input_data
));
input_datas
.
push_back
(
input_data
);
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerBase
::
ValidateShapeTensor
(
const
ModelTensor
&
tensor
,
int
stream_id
,
int
step_id
,
const
std
::
vector
<
TensorData
>&
input_datas
)
{
// Validate that steps 1 through N are exactly the same as step 0, since step
// 0 is the only one we send for shape tensors
for
(
size_t
count
=
1
;
count
<
batch_size_
;
count
++
)
{
int
local_step_id
=
(
step_id
+
count
)
%
data_loader_
->
GetTotalSteps
(
stream_id
);
TensorData
input_data
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputData
(
tensor
,
stream_id
,
local_step_id
,
input_data
));
if
(
input_data
.
batch1_size
!=
input_datas
.
back
().
batch1_size
)
{
return
cb
::
Error
(
"The shape tensors should be identical in a batch (mismatch "
"in size)"
,
pa
::
GENERIC_ERROR
);
}
for
(
size_t
data_idx
=
0
;
data_idx
<
input_data
.
batch1_size
;
data_idx
++
)
{
if
(
*
(
input_data
.
data_ptr
+
data_idx
)
!=
*
(
input_datas
.
back
().
data_ptr
+
data_idx
))
{
return
cb
::
Error
(
"The shape tensors should be identical in a batch "
"(mismatch in content)"
,
pa
::
GENERIC_ERROR
);
}
}
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerBase
::
InitInferData
(
InferData
&
infer_data
)
{
// Initialize inputs
for
(
const
auto
&
input
:
*
(
parser_
->
Inputs
()))
{
RETURN_IF_ERROR
(
InitInferDataInput
(
input
.
first
,
input
.
second
,
infer_data
));
}
for
(
const
auto
&
output
:
*
(
parser_
->
Outputs
()))
{
RETURN_IF_ERROR
(
InitInferDataOutput
(
output
.
first
,
infer_data
));
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerBase
::
UpdateInferData
(
size_t
thread_id
,
int
stream_index
,
int
step_index
,
InferData
&
infer_data
)
{
RETURN_IF_ERROR
(
data_loader_
->
ValidateIndexes
(
stream_index
,
step_index
));
RETURN_IF_ERROR
(
UpdateInputs
(
thread_id
,
stream_index
,
step_index
,
infer_data
));
RETURN_IF_ERROR
(
UpdateValidationOutputs
(
stream_index
,
step_index
,
infer_data
));
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerBase
::
UpdateValidationOutputs
(
int
stream_index
,
int
step_index
,
InferData
&
infer_data
)
{
RETURN_IF_ERROR
(
data_loader_
->
ValidateIndexes
(
stream_index
,
step_index
));
infer_data
.
expected_outputs_
.
clear
();
for
(
const
auto
&
output
:
infer_data
.
outputs_
)
{
const
auto
&
model_output
=
(
*
(
parser_
->
Outputs
()))[
output
->
Name
()];
TensorData
output_data
;
const
int
*
set_shape_values
=
nullptr
;
int
set_shape_value_cnt
=
0
;
std
::
vector
<
TensorData
>
outputs
;
for
(
size_t
i
=
0
;
i
<
batch_size_
;
++
i
)
{
RETURN_IF_ERROR
(
data_loader_
->
GetOutputData
(
output
->
Name
(),
stream_index
,
(
step_index
+
i
)
%
data_loader_
->
GetTotalSteps
(
0
),
output_data
));
if
(
!
output_data
.
is_valid
)
{
break
;
}
outputs
.
emplace_back
(
output_data
);
// Shape tensor only need the first batch element
if
(
model_output
.
is_shape_tensor_
)
{
break
;
}
}
if
(
!
outputs
.
empty
())
{
infer_data
.
expected_outputs_
.
emplace_back
(
std
::
move
(
outputs
));
}
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerBase
::
CreateInferInput
(
cb
::
InferInput
**
infer_input
,
const
cb
::
BackendKind
kind
,
const
std
::
string
&
name
,
const
std
::
vector
<
int64_t
>&
dims
,
const
std
::
string
&
datatype
)
{
return
cb
::
InferInput
::
Create
(
infer_input
,
kind
,
name
,
dims
,
datatype
);
}
}}
// namespace triton::perfanalyzer
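
GetInputData above gathers batch_size_ elements starting at step_id and wraps around the data stream with (step_id + count) % GetTotalSteps(stream_id), so a stream shorter than the batch is simply reused from the beginning. A tiny standalone sketch of that index arithmetic, with assumed example values:

#include <cstdio>

// Illustrative only: shows how a step is chosen for each batch element by
// wrapping around the stream with modulo arithmetic, as GetInputData does.
int main()
{
  const int total_steps = 3;  // steps available in the data stream (assumed)
  const int step_id = 2;      // starting step for this request (assumed)
  const int batch_size = 4;   // batch elements to gather (assumed)

  for (int count = 0; count < batch_size; count++) {
    int local_step_id = (step_id + count) % total_steps;
    std::printf("batch element %d uses step %d\n", count, local_step_id);
  }
  // Prints steps 2, 0, 1, 2 - the stream wraps once it is exhausted.
  return 0;
}
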
src/c++/perf_analyzer/infer_data_manager_base.h (new file, 0 → 100644)
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data.h"
#include "model_parser.h"
#include "perf_utils.h"
#include "tensor_data.h"

namespace triton { namespace perfanalyzer {

/// Base class for Infer Data managers
///
class InferDataManagerBase : public IInferDataManager {
 public:
  InferDataManagerBase(
      const int32_t batch_size, const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
      : batch_size_(batch_size), parser_(parser), factory_(factory),
        data_loader_(data_loader), backend_kind_(factory->Kind())
  {
  }

  /// Populate the target InferData object with input and output objects
  /// according to the model's shape
  /// \param infer_data The target InferData object.
  /// \return cb::Error object indicating success or failure.
  cb::Error InitInferData(InferData& infer_data) override;

  /// Updates the input data to use for inference request
  /// \param thread_id The ID of the calling thread
  /// \param stream_index The data stream to use for next data
  /// \param step_index The step index to use for next data
  /// \param infer_data The target InferData object
  /// \return cb::Error object indicating success or failure.
  cb::Error UpdateInferData(
      size_t thread_id, int stream_index, int step_index,
      InferData& infer_data) override;

 protected:
  size_t batch_size_;
  std::shared_ptr<ModelParser> parser_;
  std::shared_ptr<cb::ClientBackendFactory> factory_;
  std::shared_ptr<DataLoader> data_loader_;
  std::unique_ptr<cb::ClientBackend> backend_;
  cb::BackendKind backend_kind_;

  /// Gets the input data for the specified input for the specified batch size
  ///
  /// \param name The name of the input to get data for
  /// \param tensor The ModelTensor of the input to get data for
  /// \param stream_id The ID of the stream to get data for
  /// \param step_id The ID of the step within the stream
  /// \param input_datas The returned vector of TensorDatas
  /// \return cb::Error object indicating success or failure.
  cb::Error GetInputData(
      const std::string& name, const ModelTensor& tensor, int stream_id,
      int step_id, std::vector<TensorData>& input_datas);

  /// For the case of an input with is_shape_tensor true, validate that
  /// it follows all rules, and throw an error if it does not
  /// \param tensor The ModelTensor of the input to validate
  /// \param stream_id The ID of the stream to validate
  /// \param step_id The ID of the step within the stream
  /// \param input_datas vector of TensorDatas to validate
  /// \return cb::Error object indicating success or failure.
  cb::Error ValidateShapeTensor(
      const ModelTensor& tensor, int stream_id, int step_id,
      const std::vector<TensorData>& input_datas);

  /// Helper function to update the inputs
  /// \param thread_id The ID of the calling thread
  /// \param stream_index The data stream to use for next data
  /// \param step_index The step index to use for next data
  /// \param infer_data The target InferData object
  /// \return cb::Error object indicating success or failure.
  virtual cb::Error UpdateInputs(
      const size_t thread_id, const int stream_index, const int step_index,
      InferData& infer_data) = 0;

  /// Updates the expected output data to use for inference request. Empty
  /// vector will be returned if there is no expected output associated to the
  /// step.
  /// \param stream_index The data stream to use for next data
  /// \param step_index The step index to use for next data
  /// \param infer_data The target InferData object
  /// \return cb::Error object indicating success or failure.
  cb::Error UpdateValidationOutputs(
      int stream_index, int step_index, InferData& infer_data);

  /// Creates inference input object
  /// \param infer_input Output parameter storing newly created inference input
  /// \param kind Backend kind
  /// \param name Name of inference input
  /// \param dims Shape of inference input
  /// \param datatype Data type of inference input
  /// \return cb::Error object indicating success or failure.
  virtual cb::Error CreateInferInput(
      cb::InferInput** infer_input, const cb::BackendKind kind,
      const std::string& name, const std::vector<int64_t>& dims,
      const std::string& datatype);

  virtual cb::Error InitInferDataInput(
      const std::string& name, const ModelTensor& model_tensor,
      InferData& infer_data) = 0;

  virtual cb::Error InitInferDataOutput(
      const std::string& name, InferData& infer_data) = 0;

#ifndef DOCTEST_CONFIG_DISABLE
 public:
  InferDataManagerBase() = default;
#endif
};

}}  // namespace triton::perfanalyzer
src/c++/perf_analyzer/infer_data_manager_factory.h (new file, 0 → 100644)
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data_manager.h"
#include "infer_data_manager_shm.h"
#include "model_parser.h"

namespace triton { namespace perfanalyzer {

class InferDataManagerFactory {
 public:
  static std::shared_ptr<IInferDataManager> CreateInferDataManager(
      const size_t max_threads, const int32_t batch_size,
      const SharedMemoryType shared_memory_type, const size_t output_shm_size,
      const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
  {
    if (shared_memory_type == SharedMemoryType::NO_SHARED_MEMORY) {
      return CreateInferDataManagerNoShm(
          max_threads, batch_size, parser, factory, data_loader);
    } else {
      return CreateInferDataManagerShm(
          batch_size, shared_memory_type, output_shm_size, parser, factory,
          data_loader);
    }
  }

 private:
  static std::shared_ptr<IInferDataManager> CreateInferDataManagerNoShm(
      const size_t max_threads, const int32_t batch_size,
      const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
  {
    return std::make_shared<InferDataManager>(
        max_threads, batch_size, parser, factory, data_loader);
  }

  static std::shared_ptr<IInferDataManager> CreateInferDataManagerShm(
      const int32_t batch_size, const SharedMemoryType shared_memory_type,
      const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
  {
    return std::make_shared<InferDataManagerShm>(
        batch_size, shared_memory_type, output_shm_size, parser, factory,
        data_loader);
  }
};

}}  // namespace triton::perfanalyzer
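
CreateInferDataManager chooses the concrete manager from the SharedMemoryType argument alone and forwards everything else unchanged. Below is a simplified, self-contained sketch of the same static-factory selection pattern; the IManager, PlainManager, and ShmManager types are toy stand-ins, not the Perf Analyzer classes.

#include <iostream>
#include <memory>

enum class MemoryMode { NONE, SYSTEM };

struct IManager {
  virtual ~IManager() = default;
  virtual const char* Name() const = 0;
};

struct PlainManager : IManager {
  const char* Name() const override { return "plain"; }
};

struct ShmManager : IManager {
  const char* Name() const override { return "shared-memory"; }
};

// Same shape as InferDataManagerFactory::CreateInferDataManager: a static
// function that inspects one argument and returns the matching implementation
// behind the common interface.
struct ManagerFactory {
  static std::shared_ptr<IManager> Create(MemoryMode mode)
  {
    if (mode == MemoryMode::NONE) {
      return std::make_shared<PlainManager>();
    }
    return std::make_shared<ShmManager>();
  }
};

int main()
{
  auto manager = ManagerFactory::Create(MemoryMode::SYSTEM);
  std::cout << manager->Name() << std::endl;  // prints "shared-memory"
  return 0;
}
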
src/c++/perf_analyzer/infer_data_manager_shm.cc (new file, 0 → 100644)
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_shm.h"
#include <algorithm>
namespace
triton
{
namespace
perfanalyzer
{
InferDataManagerShm
::~
InferDataManagerShm
()
{
cb
::
Error
err
;
if
(
backend_
.
get
()
!=
nullptr
)
{
err
=
backend_
->
UnregisterAllSharedMemory
();
if
(
!
err
.
IsOk
())
{
std
::
cerr
<<
"Unable to unregister all shared memory regions"
<<
std
::
endl
;
}
if
(
shared_memory_type_
==
SharedMemoryType
::
SYSTEM_SHARED_MEMORY
)
{
for
(
auto
&
region
:
shared_memory_regions_
)
{
if
(
factory_
->
Kind
()
!=
triton
::
perfanalyzer
::
clientbackend
::
BackendKind
::
TRITON_C_API
)
{
err
=
backend_
->
UnmapSharedMemory
(
shared_memory_regions_
[
region
.
first
].
data_
.
get
(),
shared_memory_regions_
[
region
.
first
].
byte_size_
);
if
(
!
err
.
IsOk
())
{
std
::
cerr
<<
"Unable to unmap shared memory with key ("
<<
region
.
first
<<
"): Starting: "
<<
static_cast
<
void
*>
(
shared_memory_regions_
[
region
.
first
].
data_
.
get
())
<<
", size: "
<<
shared_memory_regions_
[
region
.
first
].
byte_size_
<<
std
::
endl
;
}
err
=
backend_
->
UnlinkSharedMemoryRegion
(
region
.
first
);
if
(
!
err
.
IsOk
())
{
std
::
cerr
<<
"Unable to unlink shared memory with key: "
<<
region
.
first
<<
std
::
endl
;
}
}
}
}
}
}
cb
::
Error
InferDataManagerShm
::
Init
()
{
// TMA-1062 remove the factory from this class and use only the backend
RETURN_IF_ERROR
(
factory_
->
CreateClientBackend
(
&
backend_
));
// Calling this function for the clean start
backend_
->
UnregisterAllSharedMemory
();
RETURN_IF_ERROR
(
CreateOutputMemoryRegions
());
RETURN_IF_ERROR
(
CreateAndPopulateInputMemoryRegions
());
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
CreateOutputMemoryRegions
()
{
// Allocate the shared memory for outputs
for
(
const
auto
&
output
:
*
(
parser_
->
Outputs
()))
{
const
std
::
string
&
name
=
output
.
first
;
const
ModelTensor
&
tensor
=
output
.
second
;
int64_t
batch1_bytesize
=
ByteSize
(
tensor
.
shape_
,
tensor
.
datatype_
);
if
(
batch1_bytesize
<
0
)
{
batch1_bytesize
=
output_shm_size_
;
}
uint8_t
*
output_shm_ptr
;
size_t
alloc_size
=
batch1_bytesize
*
batch_size_
;
std
::
string
region_name
(
TensorToRegionName
(
name
));
RETURN_IF_ERROR
(
CreateMemoryRegion
(
region_name
,
shared_memory_type_
,
alloc_size
,
reinterpret_cast
<
void
**>
(
&
output_shm_ptr
)));
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
CreateAndPopulateInputMemoryRegions
()
{
// All combinations of input + stream + step
//
for
(
const
auto
&
input
:
*
(
parser_
->
Inputs
()))
{
const
std
::
string
&
name
=
input
.
first
;
const
ModelTensor
&
tensor
=
input
.
second
;
for
(
int
stream_id
=
0
;
stream_id
<
(
int
)
data_loader_
->
GetDataStreamsCount
();
stream_id
++
)
{
for
(
int
step_id
=
0
;
step_id
<
(
int
)
data_loader_
->
GetTotalSteps
(
stream_id
);
step_id
+=
1
)
{
RETURN_IF_ERROR
(
CreateAndPopulateInputMemoryRegion
(
name
,
tensor
,
stream_id
,
step_id
));
}
}
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
CreateAndPopulateInputMemoryRegion
(
const
std
::
string
&
name
,
const
ModelTensor
&
tensor
,
int
stream_id
,
int
step_id
)
{
std
::
vector
<
TensorData
>
input_datas
;
size_t
count
=
0
;
RETURN_IF_ERROR
(
GetInputData
(
name
,
tensor
,
stream_id
,
step_id
,
input_datas
));
if
(
tensor
.
is_shape_tensor_
)
{
RETURN_IF_ERROR
(
ValidateShapeTensor
(
tensor
,
stream_id
,
step_id
,
input_datas
));
}
size_t
alloc_size
=
0
;
for
(
size_t
i
=
0
;
i
<
input_datas
.
size
();
i
++
)
{
if
(
!
input_datas
[
i
].
is_valid
)
{
return
cb
::
Error
(
"Shared memory support in Perf Analyzer does not support "
"optional inputs at this time"
);
}
alloc_size
+=
input_datas
[
i
].
batch1_size
;
}
// Generate the shared memory region name
std
::
string
region_name
(
TensorToRegionName
(
name
)
+
"_"
+
std
::
to_string
(
stream_id
)
+
"_"
+
std
::
to_string
(
step_id
));
uint8_t
*
input_shm_ptr
;
RETURN_IF_ERROR
(
CreateMemoryRegion
(
region_name
,
shared_memory_type_
,
alloc_size
,
reinterpret_cast
<
void
**>
(
&
input_shm_ptr
)));
RETURN_IF_ERROR
(
CopySharedMemory
(
input_shm_ptr
,
input_datas
,
tensor
.
is_shape_tensor_
,
region_name
));
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
CreateMemoryRegion
(
const
std
::
string
&
shm_region_name
,
const
SharedMemoryType
&
memory_type
,
const
size_t
byte_size
,
void
**
ptr
)
{
if
(
memory_type
==
SharedMemoryType
::
SYSTEM_SHARED_MEMORY
)
{
if
(
factory_
->
Kind
()
==
triton
::
perfanalyzer
::
clientbackend
::
BackendKind
::
TRITON_C_API
)
{
*
ptr
=
new
uint8_t
[
byte_size
];
RETURN_IF_ERROR
(
backend_
->
RegisterSystemMemory
(
shm_region_name
,
*
ptr
,
byte_size
));
// Set free as the destructor.
shared_memory_regions_
.
emplace
(
std
::
piecewise_construct
,
std
::
forward_as_tuple
(
shm_region_name
),
std
::
forward_as_tuple
(
SharedMemoryData
(
byte_size
,
std
::
unique_ptr
<
uint8_t
,
std
::
function
<
void
(
uint8_t
*
)
>>
(
reinterpret_cast
<
uint8_t
*>
(
*
ptr
),
[](
uint8_t
*
memory
)
{
free
(
memory
);
}))));
}
else
{
std
::
string
shm_key
(
"/"
+
shm_region_name
);
int
shm_fd_op
;
RETURN_IF_ERROR
(
backend_
->
CreateSharedMemoryRegion
(
shm_key
,
byte_size
,
&
shm_fd_op
));
RETURN_IF_ERROR
(
backend_
->
MapSharedMemory
(
shm_fd_op
,
0
,
byte_size
,
ptr
));
RETURN_IF_ERROR
(
backend_
->
RegisterSystemSharedMemory
(
shm_region_name
,
shm_key
,
byte_size
));
// No-op destruction
shared_memory_regions_
.
emplace
(
std
::
piecewise_construct
,
std
::
forward_as_tuple
(
shm_region_name
),
std
::
forward_as_tuple
(
SharedMemoryData
(
byte_size
,
std
::
unique_ptr
<
uint8_t
,
std
::
function
<
void
(
uint8_t
*
)
>>
(
reinterpret_cast
<
uint8_t
*>
(
*
ptr
),
[](
uint8_t
*
memory
)
{}))));
}
}
else
if
(
memory_type
==
SharedMemoryType
::
CUDA_SHARED_MEMORY
)
{
#ifdef TRITON_ENABLE_GPU
cudaError_t
cuda_err
=
cudaMalloc
((
void
**
)
ptr
,
byte_size
);
if
(
cuda_err
!=
cudaSuccess
)
{
return
cb
::
Error
(
"unable to allocate memory of "
+
std
::
to_string
(
byte_size
)
+
" bytes on gpu for output: "
+
std
::
string
(
cudaGetErrorString
(
cuda_err
)),
pa
::
GENERIC_ERROR
);
}
if
(
factory_
->
Kind
()
==
triton
::
perfanalyzer
::
clientbackend
::
BackendKind
::
TRITON_C_API
)
{
RETURN_IF_ERROR
(
backend_
->
RegisterCudaMemory
(
shm_region_name
,
*
ptr
,
byte_size
));
// Set cudaFree as the destructor
shared_memory_regions_
.
emplace
(
std
::
piecewise_construct
,
std
::
forward_as_tuple
(
shm_region_name
),
std
::
forward_as_tuple
(
SharedMemoryData
(
byte_size
,
std
::
unique_ptr
<
uint8_t
,
std
::
function
<
void
(
uint8_t
*
)
>>
(
reinterpret_cast
<
uint8_t
*>
(
*
ptr
),
[
shm_region_name
,
byte_size
](
uint8_t
*
memory
)
{
cudaError_t
cuda_err
=
cudaFree
(
memory
);
if
(
cuda_err
!=
cudaSuccess
)
{
std
::
cerr
<<
"Unable to free cuda shared memory for "
<<
shm_region_name
<<
": Starting: "
<<
static_cast
<
void
*>
(
memory
)
<<
", size: "
<<
byte_size
<<
" bytes, Details: "
<<
cudaGetErrorString
(
cuda_err
)
<<
std
::
endl
;
}
}))));
}
else
{
cudaIpcMemHandle_t
cuda_handle
;
RETURN_IF_ERROR
(
CreateCUDAIPCHandle
(
&
cuda_handle
,
reinterpret_cast
<
void
*>
(
*
ptr
)));
RETURN_IF_ERROR
(
backend_
->
RegisterCudaSharedMemory
(
shm_region_name
,
cuda_handle
,
byte_size
));
// No operation required for deleting the memory
shared_memory_regions_
.
emplace
(
std
::
piecewise_construct
,
std
::
forward_as_tuple
(
shm_region_name
),
std
::
forward_as_tuple
(
SharedMemoryData
(
byte_size
,
std
::
unique_ptr
<
uint8_t
,
std
::
function
<
void
(
uint8_t
*
)
>>
(
reinterpret_cast
<
uint8_t
*>
(
*
ptr
),
[](
uint8_t
*
memory
)
{}))));
}
#endif // TRITON_ENABLE_GPU
}
else
{
return
cb
::
Error
(
"CreateMemoryRegion called with invalid memory region type."
,
pa
::
GENERIC_ERROR
);
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
CopySharedMemory
(
uint8_t
*
input_shm_ptr
,
const
std
::
vector
<
TensorData
>&
tensor_datas
,
bool
is_shape_tensor
,
std
::
string
&
region_name
)
{
if
(
shared_memory_type_
==
SharedMemoryType
::
SYSTEM_SHARED_MEMORY
)
{
// Populate the region with data
size_t
count
=
0
;
size_t
offset
=
0
;
size_t
max_count
=
is_shape_tensor
?
1
:
batch_size_
;
while
(
count
<
max_count
)
{
memcpy
(
input_shm_ptr
+
offset
,
tensor_datas
[
count
].
data_ptr
,
tensor_datas
[
count
].
batch1_size
);
offset
+=
tensor_datas
[
count
].
batch1_size
;
count
++
;
}
}
else
{
#ifdef TRITON_ENABLE_GPU
// Populate the region with data
size_t
count
=
0
;
size_t
offset
=
0
;
size_t
max_count
=
is_shape_tensor
?
1
:
batch_size_
;
while
(
count
<
max_count
)
{
cudaError_t
cuda_err
=
cudaMemcpy
(
(
void
*
)(
input_shm_ptr
+
offset
),
(
void
*
)
tensor_datas
[
count
].
data_ptr
,
tensor_datas
[
count
].
batch1_size
,
cudaMemcpyHostToDevice
);
if
(
cuda_err
!=
cudaSuccess
)
{
return
cb
::
Error
(
"Failed to copy data to cuda shared memory for "
+
region_name
+
" : "
+
std
::
string
(
cudaGetErrorString
(
cuda_err
)),
pa
::
GENERIC_ERROR
);
}
offset
+=
tensor_datas
[
count
].
batch1_size
;
count
++
;
}
#endif // TRITON_ENABLE_GPU
}
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
InitInferDataInput
(
const
std
::
string
&
name
,
const
ModelTensor
&
model_tensor
,
InferData
&
infer_data
)
{
std
::
vector
<
int64_t
>
shape
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputShape
(
model_tensor
,
0
,
0
,
&
shape
));
if
(
!
shape
.
empty
())
{
if
((
parser_
->
MaxBatchSize
()
!=
0
)
&&
(
!
model_tensor
.
is_shape_tensor_
))
{
shape
.
insert
(
shape
.
begin
(),
(
int64_t
)
batch_size_
);
}
}
else
{
return
cb
::
Error
(
"unable to set shape for the input"
,
pa
::
GENERIC_ERROR
);
}
cb
::
InferInput
*
infer_input
;
RETURN_IF_ERROR
(
CreateInferInput
(
&
infer_input
,
backend_kind_
,
name
,
shape
,
model_tensor
.
datatype_
));
infer_data
.
inputs_
.
push_back
(
infer_input
);
// FIXME: TMA-765 - Shared memory mode does not support optional inputs,
// currently, and will be implemented in the associated story.
infer_data
.
valid_inputs_
.
push_back
(
infer_input
);
std
::
string
region_name
(
TensorToRegionName
(
name
)
+
"_"
+
std
::
to_string
(
0
)
+
"_"
+
std
::
to_string
(
0
));
RETURN_IF_ERROR
(
infer_input
->
SetSharedMemory
(
region_name
,
shared_memory_regions_
[
region_name
].
byte_size_
));
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
InitInferDataOutput
(
const
std
::
string
&
name
,
InferData
&
infer_data
)
{
cb
::
InferRequestedOutput
*
requested_output
;
RETURN_IF_ERROR
(
cb
::
InferRequestedOutput
::
Create
(
&
requested_output
,
backend_kind_
,
name
));
infer_data
.
outputs_
.
push_back
(
requested_output
);
std
::
string
region_name
(
TensorToRegionName
(
name
));
RETURN_IF_ERROR
(
requested_output
->
SetSharedMemory
(
region_name
,
shared_memory_regions_
[
region_name
].
byte_size_
));
return
cb
::
Error
::
Success
;
}
cb
::
Error
InferDataManagerShm
::
UpdateInputs
(
const
size_t
thread_id
,
const
int
stream_index
,
const
int
step_index
,
InferData
&
infer_data
)
{
for
(
const
auto
&
input
:
infer_data
.
inputs_
)
{
RETURN_IF_ERROR
(
input
->
Reset
());
const
auto
&
model_input
=
(
*
(
parser_
->
Inputs
()))[
input
->
Name
()];
std
::
string
region_name
(
TensorToRegionName
(
input
->
Name
())
+
'_'
+
std
::
to_string
(
stream_index
)
+
"_"
+
std
::
to_string
(
step_index
));
std
::
vector
<
int64_t
>
shape
;
RETURN_IF_ERROR
(
data_loader_
->
GetInputShape
(
model_input
,
stream_index
,
step_index
,
&
shape
));
if
(
!
shape
.
empty
())
{
if
((
parser_
->
MaxBatchSize
()
!=
0
)
&&
(
!
model_input
.
is_shape_tensor_
))
{
shape
.
insert
(
shape
.
begin
(),
(
int64_t
)
batch_size_
);
}
input
->
SetShape
(
shape
);
}
RETURN_IF_ERROR
(
input
->
SetSharedMemory
(
region_name
,
shared_memory_regions_
[
region_name
].
byte_size_
));
}
return
cb
::
Error
::
Success
;
}
}}
// namespace triton::perfanalyzer
src/c++/perf_analyzer/infer_data_manager_shm.h (new file, 0 → 100644)
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"

namespace triton { namespace perfanalyzer {

namespace {

#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>

#define RETURN_IF_CUDA_ERR(FUNC)                               \
  {                                                            \
    const cudaError_t result = FUNC;                           \
    if (result != cudaSuccess) {                               \
      return cb::Error(                                        \
          "CUDA exception (line " + std::to_string(__LINE__) + \
              "): " + cudaGetErrorName(result) + " (" +        \
              cudaGetErrorString(result) + ")",                \
          pa::GENERIC_ERROR);                                  \
    }                                                          \
  }

cb::Error
CreateCUDAIPCHandle(
    cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0)
{
  // Set the GPU device to the desired GPU
  RETURN_IF_CUDA_ERR(cudaSetDevice(device_id));

  // Create IPC handle for data on the gpu
  RETURN_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr));

  return cb::Error::Success;
}
#endif  // TRITON_ENABLE_GPU

}  // namespace

/// Holds information about the shared memory locations
struct SharedMemoryData {
  SharedMemoryData(
      size_t byte_size,
      std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data)
      : byte_size_(byte_size), data_(std::move(data))
  {
  }

  SharedMemoryData() {}

  // Byte size
  size_t byte_size_;

  // Unique pointer holding the shared memory data
  std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data_;
};

/// Manages infer data to prepare an inference request and the resulting
/// inference output from triton server
class InferDataManagerShm : public InferDataManagerBase {
 public:
  InferDataManagerShm(
      const int32_t batch_size, const SharedMemoryType shared_memory_type,
      const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory,
      const std::shared_ptr<DataLoader>& data_loader)
      : shared_memory_type_(shared_memory_type),
        output_shm_size_(output_shm_size),
        InferDataManagerBase(batch_size, parser, factory, data_loader)
  {
  }

  ~InferDataManagerShm();

  /// Initialize this object. Must be called before any other functions
  /// \return cb::Error object indicating success or failure.
  cb::Error Init() override;

 protected:
  cb::Error CreateOutputMemoryRegions();
  cb::Error CreateAndPopulateInputMemoryRegions();
  cb::Error CreateAndPopulateInputMemoryRegion(
      const std::string& name, const ModelTensor& tensor, int stream_id,
      int step_id);

  /// Create a memory region.
  /// \return cb::Error object indicating success or failure.
  cb::Error CreateMemoryRegion(
      const std::string& shm_region_name, const SharedMemoryType& memory_type,
      const size_t byte_size, void** ptr);

  /// \brief Helper function to handle copying shared memory to the correct
  /// memory region
  /// \param input_shm_ptr Pointer to the shared memory for a specific input
  /// \param input_datas The TensorDatas to be copied
  /// \param is_shape_tensor Is the input a shape tensor
  /// \param region_name Name of the shared memory region
  /// \return cb::Error object indicating success or failure
  virtual cb::Error CopySharedMemory(
      uint8_t* input_shm_ptr, const std::vector<TensorData>& input_datas,
      bool is_shape_tensor, std::string& region_name);

  cb::Error InitInferDataInput(
      const std::string& name, const ModelTensor& model_tensor,
      InferData& infer_data) override;

  cb::Error InitInferDataOutput(
      const std::string& name, InferData& infer_data) override;

  /// Helper function to update the inputs
  /// \param thread_id The ID of the calling thread
  /// \param stream_index The data stream to use for next data
  /// \param step_index The step index to use for next data
  /// \param infer_data The target InferData object
  /// \return cb::Error object indicating success or failure.
  virtual cb::Error UpdateInputs(
      size_t thread_id, const int stream_index, const int step_index,
      InferData& infer_data) override;

  SharedMemoryType shared_memory_type_;
  size_t output_shm_size_;
  // Map from shared memory key to its starting address and size
  std::unordered_map<std::string, SharedMemoryData> shared_memory_regions_;
};

}}  // namespace triton::perfanalyzer
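
SharedMemoryData above holds its region pointer as a std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>, which lets InferDataManagerShm attach a different cleanup action to each region: releasing locally allocated memory, calling cudaFree for device memory, or doing nothing when the region is unmapped elsewhere. A small standalone sketch of that custom-deleter ownership pattern (illustrative only, not part of this commit):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>

int main()
{
  using Region = std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>;

  // Heap-backed region: the deleter releases the allocation.
  Region heap_region(
      static_cast<uint8_t*>(std::malloc(64)),
      [](uint8_t* p) { std::free(p); });

  // Externally managed region: a no-op deleter, as used for memory whose
  // lifetime is handled elsewhere (e.g. a mapped or registered region).
  static uint8_t external_buffer[64];
  Region external_region(external_buffer, [](uint8_t*) {});

  std::printf(
      "regions created: %p %p\n", static_cast<void*>(heap_region.get()),
      static_cast<void*>(external_region.get()));
  return 0;
}  // both deleters run here; only the heap-backed region is actually freed
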
src/c++/perf_analyzer/inference_profiler.cc (new file, 0 → 100644)
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "inference_profiler.h"
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <limits>
#include <queue>
#include <sstream>
#include <stdexcept>
#include "client_backend/client_backend.h"
#include "constants.h"
#include "doctest.h"
namespace
triton
{
namespace
perfanalyzer
{
cb
::
Error
ReportPrometheusMetrics
(
const
Metrics
&
metrics
)
{
const
size_t
max_num_gpus_in_stdout
{
16
};
if
(
metrics
.
gpu_utilization_per_gpu
.
size
()
>
max_num_gpus_in_stdout
||
metrics
.
gpu_power_usage_per_gpu
.
size
()
>
max_num_gpus_in_stdout
||
metrics
.
gpu_memory_used_bytes_per_gpu
.
size
()
>
max_num_gpus_in_stdout
||
metrics
.
gpu_memory_total_bytes_per_gpu
.
size
()
>
max_num_gpus_in_stdout
)
{
std
::
cout
<<
"Too many GPUs on system to print out individual Prometheus "
"metrics, use the CSV output feature to see metrics."
<<
std
::
endl
;
return
cb
::
Error
::
Success
;
}
std
::
cout
<<
" Avg GPU Utilization:"
<<
std
::
endl
;
for
(
const
auto
&
gpu_uuid_metric_pair
:
metrics
.
gpu_utilization_per_gpu
)
{
const
auto
gpu_uuid
{
gpu_uuid_metric_pair
.
first
};
const
auto
metric
{
gpu_uuid_metric_pair
.
second
};
std
::
cout
<<
" "
<<
gpu_uuid
<<
" : "
<<
(
metric
*
100.0
)
<<
"%"
<<
std
::
endl
;
}
std
::
cout
<<
" Avg GPU Power Usage:"
<<
std
::
endl
;
for
(
const
auto
&
gpu_uuid_metric_pair
:
metrics
.
gpu_power_usage_per_gpu
)
{
const
auto
gpu_uuid
{
gpu_uuid_metric_pair
.
first
};
const
auto
metric
{
gpu_uuid_metric_pair
.
second
};
std
::
cout
<<
" "
<<
gpu_uuid
<<
" : "
<<
metric
<<
" watts"
<<
std
::
endl
;
}
std
::
cout
<<
" Max GPU Memory Usage:"
<<
std
::
endl
;
for
(
const
auto
&
gpu_uuid_metric_pair
:
metrics
.
gpu_memory_used_bytes_per_gpu
)
{
const
auto
gpu_uuid
{
gpu_uuid_metric_pair
.
first
};
const
auto
metric
{
gpu_uuid_metric_pair
.
second
};
std
::
cout
<<
" "
<<
gpu_uuid
<<
" : "
<<
metric
<<
" bytes"
<<
std
::
endl
;
}
std
::
cout
<<
" Total GPU Memory:"
<<
std
::
endl
;
for
(
const
auto
&
gpu_uuid_metric_pair
:
metrics
.
gpu_memory_total_bytes_per_gpu
)
{
const
auto
gpu_uuid
{
gpu_uuid_metric_pair
.
first
};
const
auto
metric
{
gpu_uuid_metric_pair
.
second
};
std
::
cout
<<
" "
<<
gpu_uuid
<<
" : "
<<
metric
<<
" bytes"
<<
std
::
endl
;
}
return
cb
::
Error
::
Success
;
}
namespace
{
inline
uint64_t
AverageDurationInUs
(
const
uint64_t
total_time_in_ns
,
const
uint64_t
cnt
)
{
if
(
cnt
==
0
)
{
return
0
;
}
return
total_time_in_ns
/
(
cnt
*
1000
);
}
EnsembleDurations
GetTotalEnsembleDurations
(
const
ServerSideStats
&
stats
)
{
EnsembleDurations
result
;
for
(
const
auto
&
model_stats
:
stats
.
composing_models_stat
)
{
if
(
model_stats
.
second
.
composing_models_stat
.
empty
())
{
// Cache hit count covers cache hits, not related to compute times
const
uint64_t
cache_hit_cnt
=
model_stats
.
second
.
cache_hit_count
;
// cache_miss_cnt should either equal infer_cnt or be zero if
// cache is disabled or not supported for the model/scheduler type
const
uint64_t
cache_miss_cnt
=
model_stats
.
second
.
cache_miss_count
;
result
.
total_queue_time_avg_us
+=
AverageDurationInUs
(
model_stats
.
second
.
queue_time_ns
,
model_stats
.
second
.
queue_count
);
const
uint64_t
compute_time
=
model_stats
.
second
.
compute_input_time_ns
+
model_stats
.
second
.
compute_infer_time_ns
+
model_stats
.
second
.
compute_output_time_ns
;
if
(
model_stats
.
second
.
compute_input_count
!=
model_stats
.
second
.
compute_infer_count
||
model_stats
.
second
.
compute_infer_count
!=
model_stats
.
second
.
compute_output_count
)
{
throw
std
::
runtime_error
(
"Server side statistics compute counts must be the same."
);
}
const
uint64_t
compute_cnt
=
model_stats
.
second
.
compute_input_count
;
result
.
total_compute_time_avg_us
+=
AverageDurationInUs
(
compute_time
,
compute_cnt
);
result
.
total_cache_hit_time_avg_us
+=
AverageDurationInUs
(
model_stats
.
second
.
cache_hit_time_ns
,
cache_hit_cnt
);
result
.
total_cache_miss_time_avg_us
+=
AverageDurationInUs
(
model_stats
.
second
.
cache_miss_time_ns
,
cache_miss_cnt
);
// Track combined cache/compute total avg for reporting latency with cache
// enabled
result
.
total_combined_cache_compute_time_avg_us
+=
AverageDurationInUs
(
compute_time
+
model_stats
.
second
.
cache_hit_time_ns
+
model_stats
.
second
.
cache_miss_time_ns
,
compute_cnt
+
cache_hit_cnt
);
}
else
{
const
auto
this_ensemble_duration
=
GetTotalEnsembleDurations
(
model_stats
.
second
);
result
.
total_queue_time_avg_us
+=
this_ensemble_duration
.
total_queue_time_avg_us
;
result
.
total_compute_time_avg_us
+=
this_ensemble_duration
.
total_compute_time_avg_us
;
result
.
total_cache_hit_time_avg_us
+=
this_ensemble_duration
.
total_cache_hit_time_avg_us
;
result
.
total_cache_miss_time_avg_us
+=
this_ensemble_duration
.
total_cache_miss_time_avg_us
;
result
.
total_combined_cache_compute_time_avg_us
+=
this_ensemble_duration
.
total_combined_cache_compute_time_avg_us
;
}
}
return
result
;
}
size_t
GetOverheadDuration
(
size_t
total_time
,
size_t
queue_time
,
size_t
compute_time
)
{
return
(
total_time
>
queue_time
+
compute_time
)
?
(
total_time
-
queue_time
-
compute_time
)
:
0
;
}
cb::Error
ReportServerSideStats(
    const ServerSideStats& stats, const int iteration,
    const std::shared_ptr<ModelParser>& parser)
{
  const std::string ident = std::string(2 * iteration, ' ');

  // Infer/exec counts cover compute time done in inference backends,
  // not related to cache hit times
  const uint64_t exec_cnt = stats.execution_count;
  const uint64_t infer_cnt = stats.inference_count;
  // Cache hit count covers cache hits, not related to compute times
  const uint64_t cache_hit_cnt = stats.cache_hit_count;
  const uint64_t cache_miss_cnt = stats.cache_miss_count;

  // Success count covers all successful requests, cumulative time, queue
  // time, compute, and cache
  const uint64_t cnt = stats.success_count;
  if (cnt == 0) {
    std::cout << ident << " Request count: " << cnt << std::endl;
    return cb::Error::Success;
  }

  const uint64_t cumm_avg_us = AverageDurationInUs(stats.cumm_time_ns, cnt);

  std::cout << ident << " Inference count: " << infer_cnt << std::endl
            << ident << " Execution count: " << exec_cnt << std::endl;
  if (parser->ResponseCacheEnabled()) {
    std::cout << ident << " Cache hit count: " << cache_hit_cnt << std::endl;
    std::cout << ident << " Cache miss count: " << cache_miss_cnt << std::endl;
  }
  std::cout << ident << " Successful request count: " << cnt << std::endl
            << ident << " Avg request latency: " << cumm_avg_us << " usec";

  // Non-ensemble model
  if (stats.composing_models_stat.empty()) {
    const uint64_t queue_avg_us =
        AverageDurationInUs(stats.queue_time_ns, stats.queue_count);
    const uint64_t compute_input_avg_us = AverageDurationInUs(
        stats.compute_input_time_ns, stats.compute_input_count);
    const uint64_t compute_infer_avg_us = AverageDurationInUs(
        stats.compute_infer_time_ns, stats.compute_infer_count);
    const uint64_t compute_output_avg_us = AverageDurationInUs(
        stats.compute_output_time_ns, stats.compute_output_count);

    const uint64_t compute_time = stats.compute_input_time_ns +
                                  stats.compute_infer_time_ns +
                                  stats.compute_output_time_ns;
    if (stats.compute_input_count != stats.compute_infer_count ||
        stats.compute_infer_count != stats.compute_output_count) {
      throw std::runtime_error(
          "Server side statistics compute counts must be the same.");
    }
    const uint64_t compute_cnt = stats.compute_input_count;
    const uint64_t compute_avg_us =
        AverageDurationInUs(compute_time, compute_cnt);

    const uint64_t cache_hit_avg_us =
        AverageDurationInUs(stats.cache_hit_time_ns, cache_hit_cnt);
    const uint64_t cache_miss_avg_us =
        AverageDurationInUs(stats.cache_miss_time_ns, cache_miss_cnt);
    const uint64_t total_compute_time_ns = stats.compute_input_time_ns +
                                           stats.compute_infer_time_ns +
                                           stats.compute_output_time_ns;
    // Get the average of cache hits and misses across successful requests
    const uint64_t combined_cache_compute_avg_us = AverageDurationInUs(
        stats.cache_hit_time_ns + stats.cache_miss_time_ns +
            total_compute_time_ns,
        compute_cnt + cache_hit_cnt);

    if (parser->ResponseCacheEnabled()) {
      const uint64_t overhead_avg_us = GetOverheadDuration(
          cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);

      std::cout << " (overhead " << overhead_avg_us << " usec + "
                << "queue " << queue_avg_us << " usec + "
                << "cache hit/miss " << combined_cache_compute_avg_us
                << " usec)" << std::endl;
      std::cout << ident << ident
                << " Average Cache Hit Latency: " << cache_hit_avg_us
                << " usec" << std::endl;
      std::cout << ident << ident << " Average Cache Miss Latency: "
                << cache_miss_avg_us + compute_avg_us << " usec "
                << "(cache lookup/insertion " << cache_miss_avg_us
                << " usec + "
                << "compute input " << compute_input_avg_us << " usec + "
                << "compute infer " << compute_infer_avg_us << " usec + "
                << "compute output " << compute_output_avg_us << " usec)"
                << std::endl
                << std::endl;
    }
    // Response Cache Disabled
    else {
      std::cout << " (overhead "
                << GetOverheadDuration(
                       cumm_avg_us, queue_avg_us, compute_avg_us)
                << " usec + "
                << "queue " << queue_avg_us << " usec + "
                << "compute input " << compute_input_avg_us << " usec + "
                << "compute infer " << compute_infer_avg_us << " usec + "
                << "compute output " << compute_output_avg_us << " usec)"
                << std::endl
                << std::endl;

      if (cache_hit_avg_us > 0 || cache_miss_avg_us > 0) {
        std::cerr << "Response Cache is disabled for model ["
                  << parser->ModelName()
                  << "] but cache hit/miss latency is non-zero." << std::endl;
      }
    }
  }
  // Ensemble Model
  else {
    const auto ensemble_times = GetTotalEnsembleDurations(stats);
    // Response Cache Enabled
    if (parser->ResponseCacheEnabled()) {
      const uint64_t overhead_avg_us = GetOverheadDuration(
          cumm_avg_us, ensemble_times.total_queue_time_avg_us,
          ensemble_times.total_combined_cache_compute_time_avg_us);
      std::cout << " (overhead " << overhead_avg_us << " usec + "
                << "queue " << ensemble_times.total_queue_time_avg_us
                << " usec + "
                << "cache hit/miss "
                << ensemble_times.total_combined_cache_compute_time_avg_us
                << " usec)" << std::endl;
      std::cout << ident << ident << " Average Cache Hit Latency: "
                << ensemble_times.total_cache_hit_time_avg_us << " usec"
                << std::endl;
      std::cout << ident << ident << " Average Cache Miss Latency: "
                << ensemble_times.total_cache_miss_time_avg_us +
                       ensemble_times.total_compute_time_avg_us
                << " usec " << std::endl
                << std::endl;
    }
    // Response Cache Disabled
    else {
      std::cout << " (overhead "
                << GetOverheadDuration(
                       cumm_avg_us, ensemble_times.total_queue_time_avg_us,
                       ensemble_times.total_compute_time_avg_us)
                << " usec + "
                << "queue " << ensemble_times.total_queue_time_avg_us
                << " usec + "
                << "compute " << ensemble_times.total_compute_time_avg_us
                << " usec)" << std::endl
                << std::endl;
    }

    // List out composing models of ensemble model
    std::cout << ident << "Composing models: " << std::endl;
    for (const auto& model_stats : stats.composing_models_stat) {
      const auto& model_identifier = model_stats.first;
      std::cout << ident << model_identifier.first
                << ", version: " << model_identifier.second << std::endl;
      ReportServerSideStats(model_stats.second, iteration + 1, parser);
    }
  }

  return cb::Error::Success;
}
cb::Error
ReportClientSideStats(
    const ClientSideStats& stats, const int64_t percentile,
    const cb::ProtocolType protocol, const bool verbose,
    const bool on_sequence_model, const bool include_lib_stats,
    const double overhead_pct, const double send_request_rate,
    const bool is_decoupled_model)
{
  const uint64_t avg_latency_us = stats.avg_latency_ns / 1000;
  const uint64_t std_us = stats.std_us;

  const uint64_t avg_request_time_us = stats.avg_request_time_ns / 1000;
  const uint64_t avg_send_time_us = stats.avg_send_time_ns / 1000;
  const uint64_t avg_receive_time_us = stats.avg_receive_time_ns / 1000;
  const uint64_t avg_response_wait_time_us =
      avg_request_time_us - avg_send_time_us - avg_receive_time_us;

  std::string client_library_detail = " ";
  if (include_lib_stats) {
    if (protocol == cb::ProtocolType::GRPC) {
      client_library_detail +=
          "Avg gRPC time: " + std::to_string(avg_request_time_us) + " usec (";
      if (!verbose) {
        client_library_detail +=
            "(un)marshal request/response " +
            std::to_string(avg_send_time_us + avg_receive_time_us) +
            " usec + response wait " +
            std::to_string(avg_response_wait_time_us) + " usec)";
      } else {
        client_library_detail +=
            "marshal " + std::to_string(avg_send_time_us) +
            " usec + response wait " +
            std::to_string(avg_response_wait_time_us) + " usec + unmarshal " +
            std::to_string(avg_receive_time_us) + " usec)";
      }
    } else if (protocol == cb::ProtocolType::HTTP) {
      client_library_detail +=
          "Avg HTTP time: " + std::to_string(avg_request_time_us) + " usec (";
      if (!verbose) {
        client_library_detail +=
            "send/recv " +
            std::to_string(avg_send_time_us + avg_receive_time_us) +
            " usec + response wait " +
            std::to_string(avg_response_wait_time_us) + " usec)";
      } else {
        client_library_detail +=
            "send " + std::to_string(avg_send_time_us) +
            " usec + response wait " +
            std::to_string(avg_response_wait_time_us) + " usec + receive " +
            std::to_string(avg_receive_time_us) + " usec)";
      }
    }
  }

  std::cout << " Request count: " << stats.request_count << std::endl;

  double delay_pct =
      ((double)stats.delayed_request_count / stats.request_count) * 100;
  if (delay_pct > DELAY_PCT_THRESHOLD) {
    std::cout << " "
              << "Avg send request rate: " << std::fixed
              << std::setprecision(2) << send_request_rate << " infer/sec"
              << std::endl;
    std::cout << " "
              << "[WARNING] Perf Analyzer was not able to keep up with the "
                 "desired request rate. ";
    std::cout << delay_pct << "% of the requests were delayed. " << std::endl;
  }
  if (on_sequence_model) {
    std::cout << " Sequence count: " << stats.sequence_count << " ("
              << stats.sequence_per_sec << " seq/sec)" << std::endl;
  }
  std::cout << " Throughput: " << stats.infer_per_sec << " infer/sec"
            << std::endl;
  if (is_decoupled_model) {
    std::cout << " Response Throughput: " << stats.responses_per_sec
              << " infer/sec" << std::endl;
  }

  if (verbose) {
    std::stringstream client_overhead{""};
    client_overhead << " "
                    << "Avg client overhead: " << std::fixed
                    << std::setprecision(2) << overhead_pct << "%";
    std::cout << client_overhead.str() << std::endl;
  }

  if (percentile == -1) {
    std::cout << " Avg latency: " << avg_latency_us << " usec"
              << " (standard deviation " << std_us << " usec)" << std::endl;
  }
  for (const auto& percentile : stats.percentile_latency_ns) {
    std::cout << " p" << percentile.first
              << " latency: " << (percentile.second / 1000) << " usec"
              << std::endl;
  }

  std::cout << client_library_detail << std::endl;

  return cb::Error::Success;
}
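// Top-level reporting helper: prints client-side statistics, then optionally
// server-side statistics and Prometheus metrics, and warns when the measured
// client overhead exceeds the configured threshold.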
cb::Error
Report(
    const PerfStatus& summary, const int64_t percentile,
    const cb::ProtocolType protocol, const bool verbose,
    const bool include_lib_stats, const bool include_server_stats,
    const std::shared_ptr<ModelParser>& parser,
    const bool should_collect_metrics, const double overhead_pct_threshold)
{
  std::cout << " Client: " << std::endl;
  ReportClientSideStats(
      summary.client_stats, percentile, protocol, verbose,
      summary.on_sequence_model, include_lib_stats, summary.overhead_pct,
      summary.send_request_rate, parser->IsDecoupled());

  if (include_server_stats) {
    std::cout << " Server: " << std::endl;
    ReportServerSideStats(summary.server_stats, 1, parser);
  }

  if (should_collect_metrics) {
    std::cout << " Server Prometheus Metrics: " << std::endl;
    ReportPrometheusMetrics(summary.metrics.front());
  }

  if (summary.overhead_pct > overhead_pct_threshold) {
    std::cout << "[WARNING] Perf Analyzer is not able to keep up with the "
                 "desired load. The results may not be accurate."
              << std::endl;
  }

  return cb::Error::Success;
}

}  // namespace
cb::Error
InferenceProfiler::Create(
    const bool verbose, const double stability_threshold,
    const uint64_t measurement_window_ms, const size_t max_trials,
    const int64_t percentile, const uint64_t latency_threshold_ms_,
    const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
    std::shared_ptr<cb::ClientBackend> profile_backend,
    std::unique_ptr<LoadManager> manager,
    std::unique_ptr<InferenceProfiler>* profiler,
    uint64_t measurement_request_count, MeasurementMode measurement_mode,
    std::shared_ptr<MPIDriver> mpi_driver, const uint64_t metrics_interval_ms,
    const bool should_collect_metrics, const double overhead_pct_threshold,
    const std::shared_ptr<ProfileDataCollector> collector,
    const bool should_collect_profile_data)
{
  std::unique_ptr<InferenceProfiler> local_profiler(new InferenceProfiler(
      verbose, stability_threshold, measurement_window_ms, max_trials,
      (percentile != -1), percentile, latency_threshold_ms_, protocol, parser,
      profile_backend, std::move(manager), measurement_request_count,
      measurement_mode, mpi_driver, metrics_interval_ms,
      should_collect_metrics, overhead_pct_threshold, collector,
      should_collect_profile_data));

  *profiler = std::move(local_profiler);

  return cb::Error::Success;
}
InferenceProfiler::InferenceProfiler(
    const bool verbose, const double stability_threshold,
    const int32_t measurement_window_ms, const size_t max_trials,
    const bool extra_percentile, const size_t percentile,
    const uint64_t latency_threshold_ms_, const cb::ProtocolType protocol,
    std::shared_ptr<ModelParser>& parser,
    std::shared_ptr<cb::ClientBackend> profile_backend,
    std::unique_ptr<LoadManager> manager, uint64_t measurement_request_count,
    MeasurementMode measurement_mode, std::shared_ptr<MPIDriver> mpi_driver,
    const uint64_t metrics_interval_ms, const bool should_collect_metrics,
    const double overhead_pct_threshold,
    const std::shared_ptr<ProfileDataCollector> collector,
    const bool should_collect_profile_data)
    : verbose_(verbose), measurement_window_ms_(measurement_window_ms),
      max_trials_(max_trials), extra_percentile_(extra_percentile),
      percentile_(percentile), latency_threshold_ms_(latency_threshold_ms_),
      protocol_(protocol), parser_(parser), profile_backend_(profile_backend),
      manager_(std::move(manager)),
      measurement_request_count_(measurement_request_count),
      measurement_mode_(measurement_mode), mpi_driver_(mpi_driver),
      should_collect_metrics_(should_collect_metrics),
      overhead_pct_threshold_(overhead_pct_threshold), collector_(collector),
      should_collect_profile_data_(should_collect_profile_data)
{
  load_parameters_.stability_threshold = stability_threshold;
  load_parameters_.stability_window = 3;
  if (profile_backend_->Kind() == cb::BackendKind::TRITON ||
      profile_backend_->Kind() == cb::BackendKind::TRITON_C_API) {
    // Measure and report client library stats only when the model
    // is not decoupled.
    include_lib_stats_ = (!parser_->IsDecoupled());
    // Measure and report server statistics only when the server
    // supports the statistics extension.
    std::set<std::string> extensions;
    profile_backend_->ServerExtensions(&extensions);
    include_server_stats_ = (extensions.find("statistics") != extensions.end());
  } else {
    include_lib_stats_ = true;
    include_server_stats_ = false;
  }
  if (should_collect_metrics_) {
    metrics_manager_ =
        std::make_shared<MetricsManager>(profile_backend, metrics_interval_ms);
  }
}
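// Runs a single profiling experiment at the given concurrency level and
// appends the merged measurement to `perf_statuses` when the run is stable
// and within the latency threshold.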
cb::Error
InferenceProfiler::Profile(
    const size_t concurrent_request_count,
    std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
    bool& is_stable)
{
  cb::Error err;
  PerfStatus perf_status{};

  perf_status.concurrency = concurrent_request_count;

  is_stable = false;
  meets_threshold = true;

  RETURN_IF_ERROR(dynamic_cast<ConcurrencyManager*>(manager_.get())
                      ->ChangeConcurrencyLevel(concurrent_request_count));

  err = ProfileHelper(perf_status, &is_stable);
  if (err.IsOk()) {
    uint64_t stabilizing_latency_ms =
        perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
    if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
        (latency_threshold_ms_ != NO_LIMIT)) {
      std::cerr << "Measured latency went over the set limit of "
                << latency_threshold_ms_ << " msec. " << std::endl;
      meets_threshold = false;
    } else if (!is_stable) {
      if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
        std::cerr << "Failed to obtain stable measurement within "
                  << max_trials_ << " measurement windows for concurrency "
                  << concurrent_request_count << ". Please try to "
                  << "increase the --measurement-interval." << std::endl;
      } else if (measurement_mode_ == MeasurementMode::COUNT_WINDOWS) {
        std::cerr << "Failed to obtain stable measurement within "
                  << max_trials_ << " measurement windows for concurrency "
                  << concurrent_request_count << ". Please try to "
                  << "increase the --measurement-request-count." << std::endl;
      }
      meets_threshold = false;
    } else {
      perf_statuses.push_back(perf_status);
      err = Report(
          perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
          include_server_stats_, parser_, should_collect_metrics_,
          overhead_pct_threshold_);
      if (!err.IsOk()) {
        std::cerr << err;
        meets_threshold = false;
      }
    }
  } else {
    return err;
  }

  return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
    const double request_rate, std::vector<PerfStatus>& perf_statuses,
    bool& meets_threshold, bool& is_stable)
{
  cb::Error err;
  PerfStatus perf_status{};

  perf_status.request_rate = request_rate;

  is_stable = false;
  meets_threshold = true;

  RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
                      ->ChangeRequestRate(request_rate));
  std::cout << "Request Rate: " << request_rate
            << " inference requests per second" << std::endl;

  err = ProfileHelper(perf_status, &is_stable);
  if (err.IsOk()) {
    uint64_t stabilizing_latency_ms =
        perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
    if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
        (latency_threshold_ms_ != NO_LIMIT)) {
      std::cerr << "Measured latency went over the set limit of "
                << latency_threshold_ms_ << " msec. " << std::endl;
      meets_threshold = false;
    } else if (!is_stable) {
      std::cerr << "Failed to obtain stable measurement." << std::endl;
      meets_threshold = false;
    } else {
      perf_statuses.push_back(perf_status);
      err = Report(
          perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
          include_server_stats_, parser_, should_collect_metrics_,
          overhead_pct_threshold_);
      if (!err.IsOk()) {
        std::cerr << err;
        meets_threshold = false;
      }
    }
  } else {
    return err;
  }

  return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
    std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
    bool& is_stable)
{
  cb::Error err;
  PerfStatus perf_status{};

  RETURN_IF_ERROR(
      dynamic_cast<CustomLoadManager*>(manager_.get())->InitCustomIntervals());
  RETURN_IF_ERROR(dynamic_cast<CustomLoadManager*>(manager_.get())
                      ->GetCustomRequestRate(&perf_status.request_rate));

  is_stable = false;
  meets_threshold = true;

  err = ProfileHelper(perf_status, &is_stable);
  if (err.IsOk()) {
    uint64_t stabilizing_latency_ms =
        perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
    if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
        (latency_threshold_ms_ != NO_LIMIT)) {
      std::cerr << "Measured latency went over the set limit of "
                << latency_threshold_ms_ << " msec. " << std::endl;
      meets_threshold = false;
    } else if (!is_stable) {
      std::cerr << "Failed to obtain stable measurement." << std::endl;
      meets_threshold = false;
    } else {
      perf_statuses.push_back(perf_status);
      err = Report(
          perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
          include_server_stats_, parser_, should_collect_metrics_,
          overhead_pct_threshold_);
      if (!err.IsOk()) {
        std::cerr << err;
        meets_threshold = false;
      }
    }
  } else {
    return err;
  }

  return cb::Error::Success;
}
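// Core measurement loop shared by all Profile() overloads. Repeats time- or
// count-based measurement windows until throughput and latency settle over
// the stability window (or max_trials_ is reached), then merges the
// individual window reports into `experiment_perf_status`.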
cb::Error
InferenceProfiler::ProfileHelper(
    PerfStatus& experiment_perf_status, bool* is_stable)
{
  // Start measurement
  LoadStatus load_status;
  size_t completed_trials = 0;
  std::queue<cb::Error> error;
  std::deque<PerfStatus> measurement_perf_statuses;
  all_request_records_.clear();
  previous_window_end_ns_ = 0;

  // Start with a fresh empty request records vector in the manager
  //
  std::vector<RequestRecord> empty_request_records;
  RETURN_IF_ERROR(manager_->SwapRequestRecords(empty_request_records));

  do {
    PerfStatus measurement_perf_status;
    measurement_perf_status.concurrency = experiment_perf_status.concurrency;
    measurement_perf_status.request_rate = experiment_perf_status.request_rate;
    RETURN_IF_ERROR(manager_->CheckHealth());

    if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
      error.push(
          Measure(measurement_perf_status, measurement_window_ms_, false));
    } else {
      error.push(
          Measure(measurement_perf_status, measurement_request_count_, true));
    }
    measurement_perf_statuses.push_back(measurement_perf_status);

    if (error.size() > load_parameters_.stability_window) {
      error.pop();
      measurement_perf_statuses.pop_front();
    }

    if (error.back().IsOk()) {
      load_status.infer_per_sec.push_back(
          measurement_perf_status.client_stats.infer_per_sec);
      load_status.latencies.push_back(
          measurement_perf_status.stabilizing_latency_ns);
    } else {
      load_status.infer_per_sec.push_back(0);
      load_status.latencies.push_back(std::numeric_limits<uint64_t>::max());
    }

    load_status.avg_ips +=
        load_status.infer_per_sec.back() / load_parameters_.stability_window;
    load_status.avg_latency +=
        load_status.latencies.back() / load_parameters_.stability_window;

    if (verbose_) {
      if (error.back().IsOk()) {
        std::cout << " Pass [" << (completed_trials + 1)
                  << "] throughput: " << load_status.infer_per_sec.back()
                  << " infer/sec. ";
        if (extra_percentile_) {
          std::cout << "p" << percentile_ << " latency: "
                    << (measurement_perf_status.client_stats
                            .percentile_latency_ns.find(percentile_)
                            ->second /
                        1000)
                    << " usec" << std::endl;
        } else {
          std::cout << "Avg latency: "
                    << (measurement_perf_status.client_stats.avg_latency_ns /
                        1000)
                    << " usec (std "
                    << measurement_perf_status.client_stats.std_us
                    << " usec). " << std::endl;
        }
      } else {
        std::cout << " Pass [" << (completed_trials + 1)
                  << "] cb::Error: " << error.back().Message() << std::endl;
      }
    }

    *is_stable = DetermineStability(load_status);

    if (IsDoneProfiling(load_status, is_stable)) {
      break;
    }

    completed_trials++;
  } while ((!early_exit) && (completed_trials < max_trials_));

  if (should_collect_metrics_) {
    metrics_manager_->StopQueryingMetrics();
  }

  // return the appropriate error which might have occurred in the
  // stability_window for its proper handling.
  while (!error.empty()) {
    if (!error.front().IsOk()) {
      return error.front();
    } else {
      error.pop();
    }
  }

  // Only merge the results if the results have stabilized.
  if (*is_stable) {
    RETURN_IF_ERROR(MergePerfStatusReports(
        measurement_perf_statuses, experiment_perf_status));
  }

  if (early_exit) {
    return cb::Error("Received exit signal.", pa::GENERIC_ERROR);
  }

  return cb::Error::Success;
}
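// A run is considered stable once the last `stability_window` measurements
// all have non-zero throughput and both throughput and latency stay within
// the configured stability threshold.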
bool
InferenceProfiler::DetermineStability(LoadStatus& load_status)
{
  bool stable = false;
  if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
    stable = true;
    size_t idx =
        load_status.infer_per_sec.size() - load_parameters_.stability_window;

    for (size_t i = idx; i < load_status.infer_per_sec.size(); i++) {
      if (load_status.infer_per_sec[i] == 0) {
        stable = false;
      }
    }

    stable = stable && CheckWindowForStability(idx, load_status);
  }
  return stable;
}
bool
InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status)
{
  return IsInferWindowStable(idx, load_status) &&
         IsLatencyWindowStable(idx, load_status);
}
bool
InferenceProfiler::IsInferWindowStable(size_t idx, LoadStatus& load_status)
{
  auto infer_start = std::begin(load_status.infer_per_sec) + idx;
  auto infer_per_sec_measurements = std::minmax_element(
      infer_start, infer_start + load_parameters_.stability_window);

  auto max_infer_per_sec = *infer_per_sec_measurements.second;
  auto min_infer_per_sec = *infer_per_sec_measurements.first;

  return max_infer_per_sec / min_infer_per_sec <=
         1 + load_parameters_.stability_threshold;
}
bool
InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status)
{
  auto latency_start = std::begin(load_status.latencies) + idx;
  auto latencies_per_sec_measurements = std::minmax_element(
      latency_start, latency_start + load_parameters_.stability_window);

  double max_latency = *latencies_per_sec_measurements.second;
  double min_latency = *latencies_per_sec_measurements.first;

  return max_latency / min_latency <= 1 + load_parameters_.stability_threshold;
}
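// Profiling stops when the measurements are stable (across all MPI ranks for
// an MPI run) or when the latency threshold has been exceeded.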
bool
InferenceProfiler::IsDoneProfiling(LoadStatus& load_status, bool* is_stable)
{
  bool done = false;
  bool within_threshold = true;
  if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
    size_t idx =
        load_status.infer_per_sec.size() - load_parameters_.stability_window;

    for (; idx < load_status.infer_per_sec.size(); idx++) {
      within_threshold &= CheckWithinThreshold(idx, load_status);
    }
  }

  if (mpi_driver_->IsMPIRun()) {
    if (AllMPIRanksAreStable(*is_stable)) {
      done = true;
    }
  } else if (*is_stable) {
    done = true;
  }
  if ((!within_threshold) && (latency_threshold_ms_ != NO_LIMIT)) {
    done = true;
  }
  return done;
}
bool
InferenceProfiler::CheckWithinThreshold(size_t idx, LoadStatus& load_status)
{
  return load_status.latencies[idx] <
         (latency_threshold_ms_ * NANOS_PER_MILLIS);
}
cb::Error
InferenceProfiler::MergeServerSideStats(
    std::vector<ServerSideStats>& server_side_stats,
    ServerSideStats& server_side_summary)
{
  auto& server_side_stat = server_side_stats[0];

  // Make sure that the perf status reports profiling settings match with each
  // other.
  for (size_t i = 1; i < server_side_stats.size(); i++) {
    if (server_side_stats[i].composing_models_stat.size() !=
        server_side_stat.composing_models_stat.size()) {
      return cb::Error(
          "Inconsistent ensemble setting detected between the trials.",
          pa::GENERIC_ERROR);
    }
  }

  // Initialize the server stats for the merged report.
  server_side_summary.inference_count = 0;
  server_side_summary.execution_count = 0;
  server_side_summary.cache_hit_count = 0;
  server_side_summary.cache_miss_count = 0;
  server_side_summary.success_count = 0;
  server_side_summary.queue_count = 0;
  server_side_summary.compute_input_count = 0;
  server_side_summary.compute_output_count = 0;
  server_side_summary.compute_infer_count = 0;
  server_side_summary.cumm_time_ns = 0;
  server_side_summary.queue_time_ns = 0;
  server_side_summary.compute_input_time_ns = 0;
  server_side_summary.compute_infer_time_ns = 0;
  server_side_summary.compute_output_time_ns = 0;
  server_side_summary.cache_hit_time_ns = 0;
  server_side_summary.cache_miss_time_ns = 0;
  server_side_summary.composing_models_stat.clear();
  for (auto& composing_model_stat : server_side_stat.composing_models_stat) {
    std::vector<ServerSideStats> composing_model_stats;
    for (auto& server_side_stat : server_side_stats) {
      composing_model_stats.push_back(
          server_side_stat.composing_models_stat[composing_model_stat.first]);
    }

    ServerSideStats merged_composing_model_stats;
    RETURN_IF_ERROR(MergeServerSideStats(
        composing_model_stats, merged_composing_model_stats));
    server_side_summary.composing_models_stat.insert(
        {composing_model_stat.first, merged_composing_model_stats});
  }

  for (auto& server_side_stat : server_side_stats) {
    // Aggregated Server Stats
    server_side_summary.inference_count += server_side_stat.inference_count;
    server_side_summary.execution_count += server_side_stat.execution_count;
    server_side_summary.cache_hit_count += server_side_stat.cache_hit_count;
    server_side_summary.cache_miss_count += server_side_stat.cache_miss_count;
    server_side_summary.success_count += server_side_stat.success_count;
    server_side_summary.queue_count += server_side_stat.queue_count;
    server_side_summary.compute_input_count +=
        server_side_stat.compute_input_count;
    server_side_summary.compute_infer_count +=
        server_side_stat.compute_infer_count;
    server_side_summary.compute_output_count +=
        server_side_stat.compute_output_count;
    server_side_summary.cumm_time_ns += server_side_stat.cumm_time_ns;
    server_side_summary.queue_time_ns += server_side_stat.queue_time_ns;
    server_side_summary.compute_input_time_ns +=
        server_side_stat.compute_input_time_ns;
    server_side_summary.compute_infer_time_ns +=
        server_side_stat.compute_infer_time_ns;
    server_side_summary.compute_output_time_ns +=
        server_side_stat.compute_output_time_ns;
    server_side_summary.cache_hit_time_ns += server_side_stat.cache_hit_time_ns;
    server_side_summary.cache_miss_time_ns +=
        server_side_stat.cache_miss_time_ns;
  }

  return cb::Error::Success;
}
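// Combines the per-window PerfStatus reports gathered during the stability
// window into a single experiment-level report: counts are summed, latencies
// are concatenated and re-summarized, and rates are recomputed over the
// combined duration.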
cb::Error
InferenceProfiler::MergePerfStatusReports(
    std::deque<PerfStatus>& perf_status_reports,
    PerfStatus& experiment_perf_status)
{
  auto& perf_status = perf_status_reports[0];

  // Make sure that the perf status reports profiling settings match with each
  // other.
  for (size_t i = 1; i < perf_status_reports.size(); i++) {
    perf_status.concurrency = experiment_perf_status.concurrency;
    perf_status.request_rate = experiment_perf_status.request_rate;

    if (perf_status_reports[i].on_sequence_model !=
        perf_status.on_sequence_model) {
      return cb::Error(
          "Inconsistent sequence setting detected.", pa::GENERIC_ERROR);
    }

    if (perf_status_reports[i].batch_size != perf_status.batch_size) {
      return cb::Error("Inconsistent batch size detected.", pa::GENERIC_ERROR);
    }

    if (perf_status_reports[i].server_stats.composing_models_stat.size() !=
        perf_status.server_stats.composing_models_stat.size()) {
      return cb::Error(
          "Inconsistent ensemble setting detected between the trials.",
          pa::GENERIC_ERROR);
    }
  }

  experiment_perf_status.batch_size = perf_status.batch_size;
  experiment_perf_status.on_sequence_model = perf_status.on_sequence_model;

  // Initialize the client stats for the merged report.
  experiment_perf_status.client_stats.request_count = 0;
  experiment_perf_status.client_stats.sequence_count = 0;
  experiment_perf_status.client_stats.delayed_request_count = 0;
  experiment_perf_status.client_stats.duration_ns = 0;
  experiment_perf_status.client_stats.avg_latency_ns = 0;
  experiment_perf_status.client_stats.percentile_latency_ns.clear();
  experiment_perf_status.client_stats.latencies.clear();
  experiment_perf_status.client_stats.std_us = 0;
  experiment_perf_status.client_stats.avg_request_time_ns = 0;
  experiment_perf_status.client_stats.avg_send_time_ns = 0;
  experiment_perf_status.client_stats.avg_receive_time_ns = 0;
  experiment_perf_status.client_stats.infer_per_sec = 0;
  experiment_perf_status.client_stats.sequence_per_sec = 0;
  experiment_perf_status.client_stats.completed_count = 0;
  experiment_perf_status.stabilizing_latency_ns = 0;
  experiment_perf_status.overhead_pct = 0;
  experiment_perf_status.send_request_rate = 0.0;
  std::vector<ServerSideStats> server_side_stats;
  for (auto& perf_status : perf_status_reports) {
    // Aggregated Client Stats
    experiment_perf_status.client_stats.request_count +=
        perf_status.client_stats.request_count;
    experiment_perf_status.client_stats.sequence_count +=
        perf_status.client_stats.sequence_count;
    experiment_perf_status.client_stats.delayed_request_count +=
        perf_status.client_stats.delayed_request_count;
    experiment_perf_status.client_stats.response_count +=
        perf_status.client_stats.response_count;
    experiment_perf_status.client_stats.duration_ns +=
        perf_status.client_stats.duration_ns;

    server_side_stats.push_back(perf_status.server_stats);

    experiment_perf_status.client_stats.latencies.insert(
        experiment_perf_status.client_stats.latencies.end(),
        perf_status.client_stats.latencies.begin(),
        perf_status.client_stats.latencies.end());
    // Accumulate the overhead percentage and send rate here to remove extra
    // traversals over the perf_status_reports
    experiment_perf_status.overhead_pct += perf_status.overhead_pct;
    experiment_perf_status.send_request_rate += perf_status.send_request_rate;
  }

  // Calculate the average overhead_pct for the experiment.
  experiment_perf_status.overhead_pct /= perf_status_reports.size();
  experiment_perf_status.send_request_rate /= perf_status_reports.size();

  if (include_lib_stats_) {
    for (auto& perf_status : perf_status_reports) {
      experiment_perf_status.client_stats.completed_count +=
          perf_status.client_stats.completed_count;

      experiment_perf_status.client_stats.avg_request_time_ns +=
          perf_status.client_stats.avg_request_time_ns *
          perf_status.client_stats.completed_count;

      experiment_perf_status.client_stats.avg_send_time_ns +=
          perf_status.client_stats.avg_send_time_ns *
          perf_status.client_stats.completed_count;

      experiment_perf_status.client_stats.avg_receive_time_ns +=
          perf_status.client_stats.avg_receive_time_ns *
          perf_status.client_stats.completed_count;
    }

    if (experiment_perf_status.client_stats.completed_count != 0) {
      experiment_perf_status.client_stats.avg_request_time_ns =
          experiment_perf_status.client_stats.avg_request_time_ns /
          experiment_perf_status.client_stats.completed_count;

      experiment_perf_status.client_stats.avg_send_time_ns =
          experiment_perf_status.client_stats.avg_send_time_ns /
          experiment_perf_status.client_stats.completed_count;

      experiment_perf_status.client_stats.avg_receive_time_ns =
          experiment_perf_status.client_stats.avg_receive_time_ns /
          experiment_perf_status.client_stats.completed_count;
    }
  }

  RETURN_IF_ERROR(MergeServerSideStats(
      server_side_stats, experiment_perf_status.server_stats));

  std::sort(
      experiment_perf_status.client_stats.latencies.begin(),
      experiment_perf_status.client_stats.latencies.end());

  float client_duration_sec =
      (float)experiment_perf_status.client_stats.duration_ns /
      NANOS_PER_SECOND;
  experiment_perf_status.client_stats.sequence_per_sec =
      experiment_perf_status.client_stats.sequence_count / client_duration_sec;
  experiment_perf_status.client_stats.infer_per_sec =
      (experiment_perf_status.client_stats.request_count *
       experiment_perf_status.batch_size) /
      client_duration_sec;
  experiment_perf_status.client_stats.responses_per_sec =
      experiment_perf_status.client_stats.response_count / client_duration_sec;
  RETURN_IF_ERROR(SummarizeLatency(
      experiment_perf_status.client_stats.latencies, experiment_perf_status));

  if (should_collect_metrics_) {
    // Put all Metric objects in a flat vector so they're easier to merge
    std::vector<std::reference_wrapper<const Metrics>> all_metrics{};
    std::for_each(
        perf_status_reports.begin(), perf_status_reports.end(),
        [&all_metrics](const PerfStatus& p) {
          std::for_each(
              p.metrics.begin(), p.metrics.end(),
              [&all_metrics](const Metrics& m) { all_metrics.push_back(m); });
        });

    Metrics merged_metrics{};
    RETURN_IF_ERROR(MergeMetrics(all_metrics, merged_metrics));
    experiment_perf_status.metrics.push_back(std::move(merged_metrics));
  }

  return cb::Error::Success;
}
cb::Error
InferenceProfiler::GetServerSideStatus(
    std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_stats)
{
  if ((parser_->SchedulerType() == ModelParser::ENSEMBLE) ||
      (parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE)) {
    RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(model_stats));
  } else {
    RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(
        model_stats, parser_->ModelName(), parser_->ModelVersion()));
  }
  return cb::Error::Success;
}
// Used for measurement
cb::Error
InferenceProfiler::Measure(
    PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based)
{
  std::map<cb::ModelIdentifier, cb::ModelStatistics> start_status;
  std::map<cb::ModelIdentifier, cb::ModelStatistics> end_status;
  cb::InferStat start_stat;
  cb::InferStat end_stat;

  manager_->ResetIdleTime();

  // Set current window start time to end of previous window. For first
  // measurement window, capture start time, server side stats, and client side
  // stats.
  uint64_t window_start_ns = previous_window_end_ns_;
  start_stat = prev_client_side_stats_;
  start_status = prev_server_side_stats_;
  if (window_start_ns == 0) {
    window_start_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                          std::chrono::system_clock::now().time_since_epoch())
                          .count();
    if (should_collect_metrics_) {
      metrics_manager_->StartQueryingMetrics();
    }
    if (include_server_stats_) {
      RETURN_IF_ERROR(GetServerSideStatus(&start_status));
    }
    RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&start_stat));
  }

  if (should_collect_metrics_) {
    try {
      metrics_manager_->CheckQueryingStatus();
    }
    catch (const std::exception& e) {
      return cb::Error(e.what(), pa::GENERIC_ERROR);
    }
  }

  if (!is_count_based) {
    // Wait for specified time interval in msec
    std::this_thread::sleep_for(
        std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2)));
  } else {
    do {
      // Check the health of the worker threads.
      RETURN_IF_ERROR(manager_->CheckHealth());

      // Wait for 1s until enough samples have been collected.
      std::this_thread::sleep_for(std::chrono::milliseconds((uint64_t)1000));
    } while (manager_->CountCollectedRequests() < measurement_window);
  }

  uint64_t window_end_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::system_clock::now().time_since_epoch())
          .count();
  previous_window_end_ns_ = window_end_ns;

  if (should_collect_metrics_) {
    metrics_manager_->GetLatestMetrics(perf_status.metrics);
  }

  // Get server status and then print report on difference between
  // before and after status.
  if (include_server_stats_) {
    RETURN_IF_ERROR(GetServerSideStatus(&end_status));
    prev_server_side_stats_ = end_status;
  }

  RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&end_stat));
  prev_client_side_stats_ = end_stat;

  std::vector<RequestRecord> current_request_records;
  RETURN_IF_ERROR(manager_->SwapRequestRecords(current_request_records));
  all_request_records_.insert(
      all_request_records_.end(), current_request_records.begin(),
      current_request_records.end());

  RETURN_IF_ERROR(Summarize(
      start_status, end_status, start_stat, end_stat, perf_status,
      window_start_ns, window_end_ns));

  return cb::Error::Success;
}
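// Builds a PerfStatus for one measurement window from the difference between
// the start and end snapshots of client and server statistics, using only
// the requests that completed inside the window.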
cb::Error
InferenceProfiler::Summarize(
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
    const cb::InferStat& start_stat, const cb::InferStat& end_stat,
    PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns)
{
  size_t valid_sequence_count = 0;
  size_t delayed_request_count = 0;
  size_t response_count = 0;

  // Get measurement from requests that fall within the time interval
  std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
  uint64_t window_duration_ns = valid_range.second - valid_range.first;
  std::vector<uint64_t> latencies;
  std::vector<RequestRecord> valid_requests{};
  ValidLatencyMeasurement(
      valid_range, valid_sequence_count, delayed_request_count, &latencies,
      response_count, valid_requests);

  if (should_collect_profile_data_) {
    CollectData(
        summary, window_start_ns, window_end_ns, std::move(valid_requests));
  }

  RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
  RETURN_IF_ERROR(SummarizeClientStat(
      start_stat, end_stat, window_duration_ns, latencies.size(),
      valid_sequence_count, delayed_request_count, response_count, summary));
  summary.client_stats.latencies = std::move(latencies);

  SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);

  double window_duration_s{
      window_duration_ns / static_cast<double>(NANOS_PER_SECOND)};

  SummarizeSendRequestRate(
      window_duration_s, manager_->GetAndResetNumSentRequests(), summary);

  if (include_server_stats_) {
    RETURN_IF_ERROR(SummarizeServerStats(
        start_status, end_status, &(summary.server_stats)));
  }

  return cb::Error::Success;
}
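// Collects the latencies of requests whose final response landed inside the
// measurement window, along with sequence, delayed-request, and response
// counts; the consumed records are moved out of all_request_records_.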
void
InferenceProfiler::ValidLatencyMeasurement(
    const std::pair<uint64_t, uint64_t>& valid_range,
    size_t& valid_sequence_count, size_t& delayed_request_count,
    std::vector<uint64_t>* valid_latencies, size_t& response_count,
    std::vector<RequestRecord>& valid_requests)
{
  valid_latencies->clear();
  valid_sequence_count = 0;
  response_count = 0;
  std::vector<size_t> erase_indices{};
  for (size_t i = 0; i < all_request_records_.size(); i++) {
    const auto& request_record = all_request_records_[i];
    uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_);
    uint64_t request_end_ns;

    if (request_record.has_null_last_response_ == false) {
      request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
    } else if (request_record.response_times_.size() > 1) {
      size_t last_response_idx{request_record.response_times_.size() - 2};
      request_end_ns =
          CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
    } else {
      erase_indices.push_back(i);
      continue;
    }

    if (request_start_ns <= request_end_ns) {
      // Only counting requests that end within the time interval
      if ((request_end_ns >= valid_range.first) &&
          (request_end_ns <= valid_range.second)) {
        valid_latencies->push_back(request_end_ns - request_start_ns);
        response_count += request_record.response_times_.size();
        if (request_record.has_null_last_response_) {
          response_count--;
        }
        erase_indices.push_back(i);
        if (request_record.sequence_end_) {
          valid_sequence_count++;
        }
        if (request_record.delayed_) {
          delayed_request_count++;
        }
      }
    }
  }

  std::for_each(
      erase_indices.begin(), erase_indices.end(),
      [this, &valid_requests](size_t i) {
        valid_requests.push_back(std::move(this->all_request_records_[i]));
      });

  // Iterate through erase indices backwards so that erases from
  // `all_request_records_` happen from the back to the front to avoid using
  // wrong indices after subsequent erases
  std::for_each(erase_indices.rbegin(), erase_indices.rend(), [this](size_t i) {
    this->all_request_records_.erase(this->all_request_records_.begin() + i);
  });

  // Always sort measured latencies as percentile will be reported as default
  std::sort(valid_latencies->begin(), valid_latencies->end());
}
void
InferenceProfiler::CollectData(
    PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
    std::vector<RequestRecord>&& request_records)
{
  InferenceLoadMode id{summary.concurrency, summary.request_rate};
  collector_->AddWindow(id, window_start_ns, window_end_ns);
  collector_->AddData(id, std::move(request_records));
}
cb::Error
InferenceProfiler::SummarizeLatency(
    const std::vector<uint64_t>& latencies, PerfStatus& summary)
{
  if (latencies.size() == 0) {
    return cb::Error(
        "No valid requests recorded within time interval."
        " Please use a larger time window.",
        pa::OPTION_ERROR);
  }

  std::tie(summary.client_stats.avg_latency_ns, summary.client_stats.std_us) =
      GetMeanAndStdDev(latencies);

  // retrieve other interesting percentile
  summary.client_stats.percentile_latency_ns.clear();
  std::set<size_t> percentiles{50, 90, 95, 99};
  if (extra_percentile_) {
    percentiles.emplace(percentile_);
  }

  for (const auto percentile : percentiles) {
    size_t index = (percentile / 100.0) * (latencies.size() - 1) + 0.5;
    summary.client_stats.percentile_latency_ns.emplace(
        percentile, latencies[index]);
  }

  if (extra_percentile_) {
    summary.stabilizing_latency_ns =
        summary.client_stats.percentile_latency_ns.find(percentile_)->second;
  } else {
    summary.stabilizing_latency_ns = summary.client_stats.avg_latency_ns;
  }

  return cb::Error::Success;
}
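// Returns the mean latency (ns) and the sample standard deviation (usec) of
// the given latencies.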
std::tuple<uint64_t, uint64_t>
InferenceProfiler::GetMeanAndStdDev(const std::vector<uint64_t>& latencies)
{
  uint64_t avg_latency_ns{0};
  uint64_t std_dev_latency_us{0};

  // calculate mean of latencies
  uint64_t tol_latency_ns{
      std::accumulate(latencies.begin(), latencies.end(), 0ULL)};
  avg_latency_ns = tol_latency_ns / latencies.size();

  // calculate sample standard deviation of latencies
  uint64_t sq_sum_latency_avg_diff_ns{0};
  std::for_each(
      latencies.begin(), latencies.end(),
      [avg_latency_ns, &sq_sum_latency_avg_diff_ns](uint64_t l) {
        sq_sum_latency_avg_diff_ns += static_cast<int64_t>(l - avg_latency_ns) *
                                      static_cast<int64_t>(l - avg_latency_ns);
      });
  if (latencies.size() > 1) {
    std_dev_latency_us =
        std::sqrt(sq_sum_latency_avg_diff_ns / (latencies.size() - 1)) / 1000;
  } else {
    std_dev_latency_us = UINT64_MAX;
    std::cerr << "WARNING: Pass contained only one request, so sample latency "
                 "standard deviation will be infinity (UINT64_MAX)."
              << std::endl;
  }

  return std::make_tuple(avg_latency_ns, std_dev_latency_us);
}
cb::Error
InferenceProfiler::SummarizeClientStat(
    const cb::InferStat& start_stat, const cb::InferStat& end_stat,
    const uint64_t duration_ns, const size_t valid_request_count,
    const size_t valid_sequence_count, const size_t delayed_request_count,
    const size_t response_count, PerfStatus& summary)
{
  summary.on_sequence_model =
      ((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
       (parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));
  summary.batch_size = std::max(manager_->BatchSize(), (size_t)1);
  summary.client_stats.request_count = valid_request_count;
  summary.client_stats.sequence_count = valid_sequence_count;
  summary.client_stats.delayed_request_count = delayed_request_count;
  summary.client_stats.response_count = response_count;
  summary.client_stats.duration_ns = duration_ns;
  float client_duration_sec =
      (float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
  summary.client_stats.sequence_per_sec =
      valid_sequence_count / client_duration_sec;
  summary.client_stats.infer_per_sec =
      (valid_request_count * summary.batch_size) / client_duration_sec;
  summary.client_stats.responses_per_sec =
      response_count / client_duration_sec;

  if (include_lib_stats_) {
    size_t completed_count =
        end_stat.completed_request_count - start_stat.completed_request_count;
    uint64_t request_time_ns = end_stat.cumulative_total_request_time_ns -
                               start_stat.cumulative_total_request_time_ns;
    summary.client_stats.completed_count = completed_count;
    uint64_t send_time_ns =
        end_stat.cumulative_send_time_ns - start_stat.cumulative_send_time_ns;
    uint64_t receive_time_ns = end_stat.cumulative_receive_time_ns -
                               start_stat.cumulative_receive_time_ns;
    if (completed_count != 0) {
      summary.client_stats.avg_request_time_ns =
          request_time_ns / completed_count;
      summary.client_stats.avg_send_time_ns = send_time_ns / completed_count;
      summary.client_stats.avg_receive_time_ns =
          receive_time_ns / completed_count;
    }
  }

  return cb::Error::Success;
}
void
InferenceProfiler::SummarizeSendRequestRate(
    const double window_duration_s, const size_t num_sent_requests,
    PerfStatus& summary)
{
  if (window_duration_s <= 0.0) {
    throw std::runtime_error("window_duration_s must be positive");
  }

  summary.send_request_rate = num_sent_requests / window_duration_s;
}
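// Resolves which model version the server-side statistics should be read
// from: the explicitly requested version if one was given, otherwise the
// highest-numbered version whose queue count increased during the window.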
cb::Error
InferenceProfiler::DetermineStatsModelVersion(
    const cb::ModelIdentifier& model_identifier,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
    int64_t* status_model_version)
{
  // If model_version is unspecified then look in the stats to find the
  // version with stats that incremented during the measurement.
  //
  // If multiple versions had incremented stats, use the highest numbered one
  // and print a warning
  *status_model_version = -1;
  bool multiple_found = false;
  bool version_unspecified = model_identifier.second.empty();

  if (version_unspecified) {
    for (const auto& x : end_stats) {
      const auto& end_id = x.first;
      const auto& end_stat = x.second;

      bool is_correct_model_name =
          model_identifier.first.compare(end_id.first) == 0;

      if (is_correct_model_name) {
        uint64_t end_queue_count = end_stat.queue_count_;
        uint64_t start_queue_count = 0;

        const auto& itr = start_stats.find(end_id);
        if (itr != start_stats.end()) {
          start_queue_count = itr->second.queue_count_;
        }

        if (end_queue_count > start_queue_count) {
          int64_t this_version = std::stoll(end_id.second);

          if (*status_model_version != -1) {
            multiple_found = true;
          }
          *status_model_version =
              std::max(*status_model_version, this_version);
        }
      }
    }
  } else {
    const auto& itr = end_stats.find(model_identifier);
    if (itr != end_stats.end()) {
      *status_model_version = std::stoll(model_identifier.second);
    }
  }

  if (*status_model_version == -1) {
    return cb::Error(
        "failed to find the requested model version", pa::GENERIC_ERROR);
  }

  if (multiple_found) {
    std::cerr << "WARNING: Multiple versions of model "
              << model_identifier.first
              << " are loaded in the triton server, and the version to use was "
                 "unspecified. The stats for that model may be inaccurate."
              << std::endl;
  }

  return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
    ServerSideStats* server_stats)
{
  RETURN_IF_ERROR(SummarizeServerStats(
      std::make_pair(parser_->ModelName(), parser_->ModelVersion()),
      start_status, end_status, server_stats));
  return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
    const cb::ModelIdentifier& model_identifier,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
    ServerSideStats* server_stats)
{
  RETURN_IF_ERROR(SummarizeServerStatsHelper(
      model_identifier, start_status, end_status, server_stats));

  // Summarize the composing models, if any.
  for (auto composing_model_identifier :
       (*parser_->GetComposingModelMap())[model_identifier.first]) {
    int64_t model_version;
    RETURN_IF_ERROR(DetermineStatsModelVersion(
        composing_model_identifier, start_status, end_status, &model_version));
    composing_model_identifier.second = std::to_string(model_version);
    auto it = server_stats->composing_models_stat
                  .emplace(composing_model_identifier, ServerSideStats())
                  .first;
    RETURN_IF_ERROR(SummarizeServerStats(
        composing_model_identifier, start_status, end_status, &(it->second)));
  }

  return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStatsHelper(
    const cb::ModelIdentifier& model_identifier,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
    const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
    ServerSideStats* server_stats)
{
  int64_t model_version;
  RETURN_IF_ERROR(DetermineStatsModelVersion(
      model_identifier, start_status, end_status, &model_version));

  const std::pair<std::string, std::string> this_id(
      model_identifier.first, std::to_string(model_version));

  const auto& end_itr = end_status.find(this_id);
  if (end_itr == end_status.end()) {
    return cb::Error(
        "missing statistics for requested model", pa::GENERIC_ERROR);
  } else {
    uint64_t start_infer_cnt = 0;
    uint64_t start_exec_cnt = 0;
    uint64_t start_cnt = 0;
    uint64_t start_queue_cnt = 0;
    uint64_t start_compute_input_cnt = 0;
    uint64_t start_compute_infer_cnt = 0;
    uint64_t start_compute_output_cnt = 0;
    uint64_t start_cumm_time_ns = 0;
    uint64_t start_queue_time_ns = 0;
    uint64_t start_compute_input_time_ns = 0;
    uint64_t start_compute_infer_time_ns = 0;
    uint64_t start_compute_output_time_ns = 0;
    uint64_t start_cache_hit_cnt = 0;
    uint64_t start_cache_hit_time_ns = 0;
    uint64_t start_cache_miss_cnt = 0;
    uint64_t start_cache_miss_time_ns = 0;

    const auto& start_itr = start_status.find(this_id);
    if (start_itr != start_status.end()) {
      start_infer_cnt = start_itr->second.inference_count_;
      start_exec_cnt = start_itr->second.execution_count_;
      start_cnt = start_itr->second.success_count_;
      start_queue_cnt = start_itr->second.queue_count_;
      start_compute_input_cnt = start_itr->second.compute_input_count_;
      start_compute_infer_cnt = start_itr->second.compute_infer_count_;
      start_compute_output_cnt = start_itr->second.compute_output_count_;
      start_cumm_time_ns = start_itr->second.cumm_time_ns_;
      start_queue_time_ns = start_itr->second.queue_time_ns_;
      start_compute_input_time_ns = start_itr->second.compute_input_time_ns_;
      start_compute_infer_time_ns = start_itr->second.compute_infer_time_ns_;
      start_compute_output_time_ns = start_itr->second.compute_output_time_ns_;
      start_cache_hit_cnt = start_itr->second.cache_hit_count_;
      start_cache_hit_time_ns = start_itr->second.cache_hit_time_ns_;
      start_cache_miss_cnt = start_itr->second.cache_miss_count_;
      start_cache_miss_time_ns = start_itr->second.cache_miss_time_ns_;
    }

    server_stats->inference_count =
        end_itr->second.inference_count_ - start_infer_cnt;
    server_stats->execution_count =
        end_itr->second.execution_count_ - start_exec_cnt;
    server_stats->success_count = end_itr->second.success_count_ - start_cnt;
    server_stats->queue_count = end_itr->second.queue_count_ - start_queue_cnt;
    server_stats->compute_input_count =
        end_itr->second.compute_input_count_ - start_compute_input_cnt;
    server_stats->compute_infer_count =
        end_itr->second.compute_infer_count_ - start_compute_infer_cnt;
    server_stats->compute_output_count =
        end_itr->second.compute_output_count_ - start_compute_output_cnt;
    server_stats->cumm_time_ns =
        end_itr->second.cumm_time_ns_ - start_cumm_time_ns;
    server_stats->queue_time_ns =
        end_itr->second.queue_time_ns_ - start_queue_time_ns;
    server_stats->compute_input_time_ns =
        end_itr->second.compute_input_time_ns_ - start_compute_input_time_ns;
    server_stats->compute_infer_time_ns =
        end_itr->second.compute_infer_time_ns_ - start_compute_infer_time_ns;
    server_stats->compute_output_time_ns =
        end_itr->second.compute_output_time_ns_ - start_compute_output_time_ns;
    server_stats->cache_hit_count =
        end_itr->second.cache_hit_count_ - start_cache_hit_cnt;
    server_stats->cache_hit_time_ns =
        end_itr->second.cache_hit_time_ns_ - start_cache_hit_time_ns;
    server_stats->cache_miss_count =
        end_itr->second.cache_miss_count_ - start_cache_miss_cnt;
    server_stats->cache_miss_time_ns =
        end_itr->second.cache_miss_time_ns_ - start_cache_miss_time_ns;
  }

  return cb::Error::Success;
}
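// Client overhead is the fraction of the measurement window during which the
// load manager threads were not idle (i.e. busy preparing/sending requests).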
void
InferenceProfiler::SummarizeOverhead(
    const uint64_t window_duration_ns, const uint64_t idle_ns,
    PerfStatus& summary)
{
  // The window start/stop is not instantaneous. It is possible that the PA
  // overhead is smaller than the delay in the window start/stop process. Treat
  // it as 0% overhead (100% idle) in that case
  //
  if (idle_ns > window_duration_ns) {
    summary.overhead_pct = 0;
  } else {
    uint64_t overhead_ns = window_duration_ns - idle_ns;
    double overhead_pct = double(overhead_ns) / window_duration_ns * 100;
    summary.overhead_pct = overhead_pct;
  }
}
bool
InferenceProfiler::AllMPIRanksAreStable(bool current_rank_stability)
{
  int world_size{mpi_driver_->MPICommSizeWorld()};
  std::vector<int> stabilities_per_rank{};
  stabilities_per_rank.resize(world_size, 0);
  int my_rank{mpi_driver_->MPICommRankWorld()};
  stabilities_per_rank[my_rank] = static_cast<int>(current_rank_stability);

  for (int rank{0}; rank < world_size; rank++) {
    mpi_driver_->MPIBcastIntWorld(stabilities_per_rank.data() + rank, 1, rank);
  }

  bool all_stable{true};
  for (int rank{0}; rank < world_size; rank++) {
    if (stabilities_per_rank[rank] == 0) {
      all_stable = false;
      break;
    }
  }

  if (verbose_ && all_stable) {
    std::cout << "All models on all MPI ranks are stable" << std::endl;
  }

  return all_stable;
}
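// Merges the GPU metrics collected across measurement windows: utilization
// and power usage are averaged, memory used takes the maximum, and memory
// total takes the first observed value per GPU.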
cb::Error
InferenceProfiler::MergeMetrics(
    const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
    Metrics& merged_metrics)
{
  // Maps from each metric collection mapping gpu uuid to gpu utilization
  std::vector<std::reference_wrapper<const std::map<std::string, double>>>
      gpu_utilization_per_gpu_maps{};

  // Maps from each metric collection mapping gpu uuid to gpu power usage
  std::vector<std::reference_wrapper<const std::map<std::string, double>>>
      gpu_power_usage_per_gpu_maps{};

  // Maps from each metric collection mapping gpu uuid to gpu memory used bytes
  std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
      gpu_memory_used_bytes_per_gpu_maps{};

  // Maps from each metric collection mapping gpu uuid to gpu memory total
  // bytes
  std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
      gpu_memory_total_bytes_per_gpu_maps{};

  // Put all metric maps in vector so they're easier to aggregate
  std::for_each(
      all_metrics.begin(), all_metrics.end(),
      [&gpu_utilization_per_gpu_maps, &gpu_power_usage_per_gpu_maps,
       &gpu_memory_used_bytes_per_gpu_maps,
       &gpu_memory_total_bytes_per_gpu_maps](
          const std::reference_wrapper<const Metrics> m) {
        gpu_utilization_per_gpu_maps.push_back(
            m.get().gpu_utilization_per_gpu);
        gpu_power_usage_per_gpu_maps.push_back(
            m.get().gpu_power_usage_per_gpu);
        gpu_memory_used_bytes_per_gpu_maps.push_back(
            m.get().gpu_memory_used_bytes_per_gpu);
        gpu_memory_total_bytes_per_gpu_maps.push_back(
            m.get().gpu_memory_total_bytes_per_gpu);
      });

  GetMetricAveragePerGPU<double>(
      gpu_utilization_per_gpu_maps, merged_metrics.gpu_utilization_per_gpu);
  GetMetricAveragePerGPU<double>(
      gpu_power_usage_per_gpu_maps, merged_metrics.gpu_power_usage_per_gpu);
  GetMetricMaxPerGPU<uint64_t>(
      gpu_memory_used_bytes_per_gpu_maps,
      merged_metrics.gpu_memory_used_bytes_per_gpu);
  GetMetricFirstPerGPU<uint64_t>(
      gpu_memory_total_bytes_per_gpu_maps,
      merged_metrics.gpu_memory_total_bytes_per_gpu);

  return cb::Error::Success;
}

}}  // namespace triton::perfanalyzer
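The per-GPU merge policy above (average utilization and power across stable windows, maximum of memory used, first value of memory total) is easy to lose in the reference_wrapper plumbing. The following standalone sketch is not part of this commit; it uses hypothetical GPU UUIDs and values and shows the same averaging and max rules on plain maps:

// Minimal standalone sketch of the per-GPU merge policy used by MergeMetrics:
// utilization is averaged across windows, memory-used takes the maximum.
// The GPU UUID and numbers below are illustrative only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::map<std::string, double>> util_windows{
      {{"GPU-0", 60.0}}, {{"GPU-0", 80.0}}};
  std::vector<std::map<std::string, uint64_t>> mem_windows{
      {{"GPU-0", 4096}}, {{"GPU-0", 8192}}};

  // Average each GPU's utilization across measurement windows.
  std::map<std::string, double> util_avg;
  std::map<std::string, size_t> util_cnt;
  for (const auto& w : util_windows) {
    for (const auto& [uuid, v] : w) {
      util_avg[uuid] += v;
      util_cnt[uuid]++;
    }
  }
  for (auto& [uuid, v] : util_avg) {
    v /= util_cnt[uuid];  // -> 70.0 for GPU-0
  }

  // Keep the maximum memory used per GPU across windows.
  std::map<std::string, uint64_t> mem_max;
  for (const auto& w : mem_windows) {
    for (const auto& [uuid, v] : w) {
      mem_max[uuid] = std::max(mem_max[uuid], v);  // -> 8192 for GPU-0
    }
  }

  std::cout << "avg util: " << util_avg["GPU-0"]
            << ", max mem used: " << mem_max["GPU-0"] << std::endl;
  return 0;
}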
src/c++/perf_analyzer/inference_profiler.h
0 → 100644
View file @ c68e1835
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <algorithm>
#include <cstdint>
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <thread>
#include <tuple>
#include <vector>
#include "concurrency_manager.h"
#include "constants.h"
#include "custom_load_manager.h"
#include "metrics.h"
#include "metrics_manager.h"
#include "model_parser.h"
#include "mpi_utils.h"
#include "profile_data_collector.h"
#include "request_rate_manager.h"
namespace triton { namespace perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
class TestInferenceProfiler;
#endif
/// Constant parameters that determine whether the stopping criteria has been
/// met for the current phase of testing
struct LoadParams {
  // The number of measurements to account for during calculation of load
  // status
  uint32_t stability_window;

  // The +/- range to account for while assessing load status
  double stability_threshold;
};
/// Data structure to keep track of real-time load status and determine whether
/// the stopping criteria has been met for the current phase of testing.
struct LoadStatus {
  // Stores the observations of infer_per_sec and latencies in a vector
  std::vector<double> infer_per_sec;
  std::vector<uint64_t> latencies;

  // Records the average inference per second within the stability window
  double avg_ips = 0;

  // Stores the average latency within the stability window
  uint64_t avg_latency = 0;
};
// Holds the totals of the timing components of the composing models of an
// ensemble.
struct EnsembleDurations {
  EnsembleDurations()
      : total_queue_time_avg_us(0), total_compute_time_avg_us(0),
        total_cache_hit_time_avg_us(0), total_cache_miss_time_avg_us(0),
        total_combined_cache_compute_time_avg_us(0)
  {
  }
  uint64_t total_queue_time_avg_us;
  uint64_t total_compute_time_avg_us;

  // Time spent on cache lookups/copies for cache hits
  uint64_t total_cache_hit_time_avg_us;
  // Time spent on cache lookups/copies/insertions for cache misses
  uint64_t total_cache_miss_time_avg_us;

  // Combined average of cache and compute times
  uint64_t total_combined_cache_compute_time_avg_us;
};
/// Holds the server-side inference statistics of the target model and its
/// composing models
struct ServerSideStats {
  uint64_t inference_count;
  uint64_t execution_count;
  uint64_t cache_hit_count;
  uint64_t cache_miss_count;
  uint64_t success_count;
  uint64_t queue_count;
  uint64_t compute_input_count;
  uint64_t compute_infer_count;
  uint64_t compute_output_count;
  uint64_t cumm_time_ns;
  uint64_t queue_time_ns;
  uint64_t compute_input_time_ns;
  uint64_t compute_infer_time_ns;
  uint64_t compute_output_time_ns;
  // Time spent on cache lookups/copies for cache hits
  uint64_t cache_hit_time_ns;
  // Time spent on cache lookups/copies/insertions for cache misses
  uint64_t cache_miss_time_ns;

  std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
};
/// Holds the statistics recorded at the client side.
struct ClientSideStats {
  // Request count and elapsed time measured by client
  uint64_t request_count;
  // Only record sequences that finish within the measurement window
  uint64_t sequence_count;
  // The number of requests that missed their schedule
  uint64_t delayed_request_count;
  // The number of responses
  uint64_t response_count;
  uint64_t duration_ns;
  uint64_t avg_latency_ns;
  // An ordered map of percentiles to be reported (<percentile, value> pair)
  std::map<size_t, uint64_t> percentile_latency_ns;
  // List of all the valid latencies.
  std::vector<uint64_t> latencies;
  // Using usec to avoid square of large number (large in nsec)
  uint64_t std_us;
  uint64_t avg_request_time_ns;
  uint64_t avg_send_time_ns;
  uint64_t avg_receive_time_ns;
  // Per sec stat
  double infer_per_sec;
  double responses_per_sec;
  double sequence_per_sec;

  // Completed request count reported by the client library
  uint64_t completed_count;
};
/// The entire statistics record.
struct PerfStatus {
  uint32_t concurrency;
  double request_rate;
  size_t batch_size;
  ServerSideStats server_stats;
  ClientSideStats client_stats;
  std::vector<Metrics> metrics{};
  double overhead_pct;
  bool on_sequence_model;

  // placeholder for the latency value that is used for conditional checking
  uint64_t stabilizing_latency_ns;

  // Metric for requests sent per second
  double send_request_rate{0.0};
};
cb::Error ReportPrometheusMetrics(const Metrics& metrics);
//==============================================================================
/// An InferenceProfiler is a helper class that measures and summarizes the
/// inference statistics under different concurrency levels.
///
/// The profiler can adjust the number of concurrent requests by informing the
/// concurrency manager. After the adjustment, the profiler actively collects
/// the statistics from both the concurrency manager and the inference server
/// directly until the measurement is stable. Once stable, the profiler updates
/// the 'status_summary' based on the most recent measurement.
///
/// The measurement procedure:
/// 1. The profiler gets start status from the server and records the start
/// time.
/// 2. After given time interval, the profiler gets end status from the server
/// and records the end time.
/// 3. The profiler obtains the request records recorded by concurrency manager,
/// and uses the request records that are recorded between start time and end
/// time to measure client side status and update status_summary.
///
class InferenceProfiler {
public:
/// Create a profiler that collects and summarizes inference statistic.
/// \param verbose Whether to print verbose logging.
/// \param stability_threshold The range that the measurement is considered as
/// stable. i.e. within (1 +/- stability_threshold) * average value of the
/// last 3 measurements. The criteria are "infer per second" and "average
/// latency", or "infer per second" and "percentile latency" if valid
/// percentile is set (see 'percentile' below).
/// \param measurement_window_ms The duration of each measurement in msec.
/// \param max_trials The maximum number of attempts to obtain
/// stable measurement.
/// \param percentile The percentile in terms of latency to be reported.
/// if it is a valid percentile value, the percentile latency will be reported
/// and used as stable criteria instead of average latency. If it is -1,
/// average latency will be reported and used as stable criteria.
/// \param latency_threshold_ms The threshold on the latency measurements in
/// milliseconds.
/// \param parser The ModelParser object which holds all the details about the
/// model.
/// \param profile_backend The ClientBackend object used to communicate
/// with the server by profiler.
/// \param manager The LoadManager object that will produce load on the
/// server.
/// \param profiler Returns a new InferenceProfiler object.
/// \param measurement_request_count The number of requests to capture when
/// using "count_windows" mode.
/// \param measurement_mode The measurement mode to use for windows.
/// \param mpi_driver The driver class for MPI operations.
/// \param metrics_interval_ms The interval at which the server-side metrics
/// are collected.
/// \param should_collect_metrics Whether server-side inference server metrics
/// should be collected.
/// \param overhead_pct_threshold User set threshold above which the PA
/// overhead is too significant to provide usable results.
/// \param collector Collector for the profile data from experiments
/// \param should_collect_profile_data Whether to collect profile data.
/// \return cb::Error object indicating success or failure.
  static cb::Error Create(
      const bool verbose, const double stability_threshold,
      const uint64_t measurement_window_ms, const size_t max_trials,
      const int64_t percentile, const uint64_t latency_threshold_ms,
      const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
      std::shared_ptr<cb::ClientBackend> profile_backend,
      std::unique_ptr<LoadManager> manager,
      std::unique_ptr<InferenceProfiler>* profiler,
      uint64_t measurement_request_count, MeasurementMode measurement_mode,
      std::shared_ptr<MPIDriver> mpi_driver,
      const uint64_t metrics_interval_ms, const bool should_collect_metrics,
      const double overhead_pct_threshold,
      const std::shared_ptr<ProfileDataCollector> collector,
      const bool should_collect_profile_data);
/// Performs the profiling on the given range with the given search algorithm.
/// For profiling using request rate invoke template with double, otherwise
/// invoke with size_t for concurrency search.
/// \param start The starting point of the search range.
/// \param end The ending point of the search range.
/// \param step The step size to move along the search range in linear search
/// or the precision in binary search.
/// \param search_mode The search algorithm to be applied.
/// \param summary Returns the trace of the measurement along the search
/// path.
/// \return cb::Error object indicating success or failure.
  template <typename T>
  cb::Error Profile(
      const T start, const T end, const T step, const SearchMode search_mode,
      std::vector<PerfStatus>& perf_statuses)
  {
    cb::Error err;
    bool meets_threshold, is_stable;
    if (search_mode == SearchMode::NONE) {
      err = Profile(perf_statuses, meets_threshold, is_stable);
      if (!err.IsOk()) {
        return err;
      }
    } else if (search_mode == SearchMode::LINEAR) {
      T current_value = start;
      do {
        err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
        if (!err.IsOk()) {
          return err;
        }
        current_value += step;
      } while (((current_value <= end) || (end == static_cast<T>(NO_LIMIT))) &&
               (meets_threshold));
      // If there was only one concurrency we swept over and it did not meet
      // the stability threshold, we should return an error.
      if (current_value == (start + step) && is_stable == false) {
        return cb::Error(
            "Failed to obtain stable measurement.", pa::STABILITY_ERROR);
      }
    } else {
      err = Profile(start, perf_statuses, meets_threshold, is_stable);
      if (!err.IsOk() || (!meets_threshold)) {
        return err;
      }
      err = Profile(end, perf_statuses, meets_threshold, is_stable);
      if (!err.IsOk() || (meets_threshold)) {
        return err;
      }

      T this_start = start;
      T this_end = end;
      while ((this_end - this_start) > step) {
        T current_value = (this_end + this_start) / 2;
        err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
        if (!err.IsOk()) {
          return err;
        }
        if (meets_threshold) {
          this_start = current_value;
        } else {
          this_end = current_value;
        }
      }
    }
    return cb::Error::Success;
  }
  bool IncludeServerStats() { return include_server_stats_; }
private:
  InferenceProfiler(
      const bool verbose, const double stability_threshold,
      const int32_t measurement_window_ms, const size_t max_trials,
      const bool extra_percentile, const size_t percentile,
      const uint64_t latency_threshold_ms, const cb::ProtocolType protocol,
      std::shared_ptr<ModelParser>& parser,
      std::shared_ptr<cb::ClientBackend> profile_backend,
      std::unique_ptr<LoadManager> manager,
      uint64_t measurement_request_count, MeasurementMode measurement_mode,
      std::shared_ptr<MPIDriver> mpi_driver,
      const uint64_t metrics_interval_ms, const bool should_collect_metrics,
      const double overhead_pct_threshold,
      const std::shared_ptr<ProfileDataCollector> collector,
      const bool should_collect_profile_data);
/// Actively measure throughput in every 'measurement_window' msec until the
/// throughput is stable. Once the throughput is stable, it adds the
/// observations on summary trace and returns whether the setting met the
/// threshold. NOTE: the requests are being sent regardless of the
/// measurement, so the data returned by the server (see struct
/// PerfStatus) will include more requests than what the client
/// measures (we can't get the exact server status right before the first
/// request and right after the last request in the measurement window).
/// \param concurrent_request_count The concurrency level for the measurement.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list. \param meets_threshold Returns whether the setting meets the
/// threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
  cb::Error Profile(
      const size_t concurrent_request_count,
      std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
      bool& is_stable);
/// Similar to above function, but instead of setting the concurrency, it
/// sets the specified request rate for measurements.
/// \param request_rate The request rate for inferences.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list. \param meets_threshold Returns whether the setting meets the
/// threshold. \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
  cb::Error Profile(
      const double request_rate, std::vector<PerfStatus>& perf_statuses,
      bool& meets_threshold, bool& is_stable);
/// Measures throughput and latencies for custom load without controlling
/// request rate nor concurrency. Requires load manager to be loaded with
/// a file specifying the time intervals.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list. \param meets_threshold Returns whether the measurement met the
/// threshold. \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success
/// or failure.
  cb::Error Profile(
      std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
      bool& is_stable);
/// A helper function for profiling functions.
/// \param status_summary Returns the summary of the measurement.
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return cb::Error object indicating success or failure.
  cb::Error ProfileHelper(PerfStatus& status_summary, bool* is_stable);
/// A helper function to determine if profiling is stable
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns if the threshold and latencies are stable.
  bool DetermineStability(LoadStatus& load_status);
/// Check if latency at index idx is within the latency threshold
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether the latencies are below the max threshold
  bool CheckWithinThreshold(size_t idx, LoadStatus& load_status);
/// A helper function to determine if profiling is done
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return Returns if we should break out of the infinite stability check
/// loop.
  bool IsDoneProfiling(LoadStatus& load_status, bool* is_stable);
/// Check if observed inferences and latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference and latency are stable
  bool CheckWindowForStability(size_t idx, LoadStatus& load_status);
/// Check if observed inferences are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference is stable
  bool IsInferWindowStable(size_t idx, LoadStatus& load_status);
/// Check if observed latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether latency is stable
  bool IsLatencyWindowStable(size_t idx, LoadStatus& load_status);
/// Helper function to perform measurement.
/// \param status_summary The summary of this measurement.
/// \param measurement_window Indicating the number of requests or the
/// duration in milliseconds to collect requests.
/// \param is_count_based determines whether measurement_window is indicating
/// time or count.
/// \return cb::Error object indicating success or failure.
  cb::Error Measure(
      PerfStatus& status_summary, uint64_t measurement_window,
      bool is_count_based);
/// Gets the server side statistics
/// \param model_status Returns the status of the models provided by
/// the server. If the model being profiled is a non-ensemble model,
/// only its status will be returned. Otherwise, the status of the composing
/// models will also be returned.
/// \return cb::Error object indicating success or failure.
  cb::Error GetServerSideStatus(
      std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_status);
/// Summarize the measurement with the provided statistics.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param start_stat The accumulated context status at the start.
/// \param end_stat The accumulated context status at the end.
/// \param summary Returns the summary of the measurement.
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \return cb::Error object indicating success or failure.
  cb::Error Summarize(
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
      const cb::InferStat& start_stat, const cb::InferStat& end_stat,
      PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns);
/// \param valid_range The start and end timestamp of the measurement window.
/// \param valid_sequence_count Returns the number of completed sequences
/// during the measurement. A sequence is a set of correlated requests sent to
/// sequence model.
/// \param latencies Returns the vector of request latencies where the
/// requests are completed within the measurement window.
/// \param response_count Returns the number of responses
/// \param valid_requests Returns a vector of valid request records
  virtual void ValidLatencyMeasurement(
      const std::pair<uint64_t, uint64_t>& valid_range,
      size_t& valid_sequence_count, size_t& delayed_request_count,
      std::vector<uint64_t>* latencies, size_t& response_count,
      std::vector<RequestRecord>& valid_requests);
/// Add the data from the request records to the Raw Data Collector
/// \param perf_status PerfStatus of the current measurement
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \param request_records The request records to collect.
  void CollectData(
      PerfStatus& perf_status, uint64_t window_start_ns,
      uint64_t window_end_ns, std::vector<RequestRecord>&& request_records);
/// \param latencies The vector of request latencies collected.
/// \param summary Returns the summary that the latency related fields are
/// set.
/// \return cb::Error object indicating success or failure.
  virtual cb::Error SummarizeLatency(
      const std::vector<uint64_t>& latencies, PerfStatus& summary);
/// \param latencies The vector of request latencies collected.
/// \return std::tuple object containing:
/// * mean of latencies in nanoseconds
/// * sample standard deviation of latencies in microseconds
  std::tuple<uint64_t, uint64_t> GetMeanAndStdDev(
      const std::vector<uint64_t>& latencies);
/// \param start_stat The accumulated client statistics at the start.
/// \param end_stat The accumulated client statistics at the end.
/// \param duration_ns The duration of the measurement in nsec.
/// \param valid_request_count The number of completed requests recorded.
/// \param valid_sequence_count The number of completed sequences recorded.
/// \param delayed_request_count The number of requests that missed their
/// schedule.
/// \param response_count The number of responses.
/// \param summary Returns the summary that the fields recorded by
/// client are set.
/// \return cb::Error object indicating success or failure.
  virtual cb::Error SummarizeClientStat(
      const cb::InferStat& start_stat, const cb::InferStat& end_stat,
      const uint64_t duration_ns, const size_t valid_request_count,
      const size_t delayed_request_count, const size_t valid_sequence_count,
      const size_t response_count, PerfStatus& summary);
/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
/// \param num_sent_requests The number of requests sent during the last
/// window.
/// \param summary The summary object to be updated with the send request rate
/// metric.
  void SummarizeSendRequestRate(
      const double window_duration_s, const size_t num_sent_requests,
      PerfStatus& summary);
/// Given a model_identifier to gather stats for, and a map of ALL stats,
/// determine which version of the model should be gathered
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model
/// \param start_stats The stats for all models at the start of the
/// measurement
/// \param end_stats The stats for all models at the end of the measurement
/// \param model_version The determined model version
  cb::Error DetermineStatsModelVersion(
      const cb::ModelIdentifier& model_identifier,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
      int64_t* model_version);
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary that the fields recorded by server
/// are set.
/// \return cb::Error object indicating success or failure.
  cb::Error SummarizeServerStats(
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
      ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary that the fields recorded by server
/// are set.
/// \return cb::Error object indicating success or failure.
  cb::Error SummarizeServerStats(
      const cb::ModelIdentifier& model_identifier,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
      ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary that the fields recorded by server
/// are set.
/// \return cb::Error object indicating success or failure.
  cb::Error SummarizeServerStatsHelper(
      const cb::ModelIdentifier& model_identifier,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
      ServerSideStats* server_stats);
/// Calculate the overhead and put the results into the summary
///
/// \param window_duration_ns The duration of the window
/// \param idle_ns The average worker idle time during the window
/// \param summary The summary object to be updated with overhead stats
///
  void SummarizeOverhead(
      const uint64_t window_duration_ns, const uint64_t idle_ns,
      PerfStatus& summary);
/// Returns true if all MPI ranks (models) are stable. Should be run if and
/// only if IsMPIRun() returns true.
/// \param current_rank_stability The stability of the current rank.
/// \return True if all MPI ranks are stable.
  bool AllMPIRanksAreStable(bool current_rank_stability);
/// Merge individual perf status reports into a single perf status. This
/// function is used to merge the results from multiple Measure runs into a
/// single report.
/// \param perf_status List of perf status reports to be merged.
/// \param summary_status Final merged summary status.
/// \return cb::Error object indicating success or failure.
  virtual cb::Error MergePerfStatusReports(
      std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);
/// Merge individual server side statistics into a single server side report.
/// \param server_side_stats List of server side statistics reports to be
/// merged.
/// \param server_side_summary Final merged summary status.
/// \return cb::Error object indicating success or failure.
  virtual cb::Error MergeServerSideStats(
      std::vector<ServerSideStats>& server_side_stats,
      ServerSideStats& server_side_summary);
/// \param all_metrics Individual metrics from all intervals from stable
/// passes.
/// \param merged_metrics Output merged metrics from all intervals from stable
/// passes.
/// \return cb::Error object indicating success or failure.
  cb::Error MergeMetrics(
      const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
      Metrics& merged_metrics);
  template <typename T>
  void GetMetricAveragePerGPU(
      const std::vector<
          std::reference_wrapper<const std::map<std::string, T>>>&
          input_metric_maps,
      std::map<std::string, T>& output_metric_map)
  {
    std::map<std::string, size_t> metric_count_per_gpu{};

    for (const auto& input_metric_map : input_metric_maps) {
      for (const auto& input_metric : input_metric_map.get()) {
        const auto& gpu_uuid{input_metric.first};
        const auto& metric{input_metric.second};

        if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
          output_metric_map[gpu_uuid] = 0;
          metric_count_per_gpu[gpu_uuid] = 0;
        }

        output_metric_map[gpu_uuid] += metric;
        metric_count_per_gpu[gpu_uuid]++;
      }
    }

    for (auto& output_metric : output_metric_map) {
      const auto& gpu_uuid{output_metric.first};
      auto& metric{output_metric.second};
      const auto& metric_count{metric_count_per_gpu[gpu_uuid]};
      if (metric_count > 0) {
        metric /= metric_count;
      }
    }
  }
  template <typename T>
  void GetMetricMaxPerGPU(
      const std::vector<
          std::reference_wrapper<const std::map<std::string, T>>>&
          input_metric_maps,
      std::map<std::string, T>& output_metric_map)
  {
    for (const auto& input_metric_map : input_metric_maps) {
      for (const auto& input_metric : input_metric_map.get()) {
        const auto& gpu_uuid{input_metric.first};
        const auto& metric{input_metric.second};

        if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
          output_metric_map[gpu_uuid] = 0;
        }

        output_metric_map[gpu_uuid] =
            std::max(output_metric_map[gpu_uuid], metric);
      }
    }
  }
  template <typename T>
  void GetMetricFirstPerGPU(
      const std::vector<
          std::reference_wrapper<const std::map<std::string, T>>>&
          input_metric_maps,
      std::map<std::string, T>& output_metric_map)
  {
    for (const auto& input_metric_map : input_metric_maps) {
      for (const auto& input_metric : input_metric_map.get()) {
        const auto& gpu_uuid{input_metric.first};
        const auto& metric{input_metric.second};

        if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
          output_metric_map[gpu_uuid] = metric;
        }
      }
    }
  }
  bool verbose_;
  uint64_t measurement_window_ms_;
  uint64_t measurement_request_count_;
  MeasurementMode measurement_mode_;
  size_t max_trials_;
  bool extra_percentile_;
  size_t percentile_;
  uint64_t latency_threshold_ms_;

  cb::ProtocolType protocol_;
  std::string model_name_;
  int64_t model_version_;

  std::shared_ptr<ModelParser> parser_;
  std::shared_ptr<cb::ClientBackend> profile_backend_;
  std::unique_ptr<LoadManager> manager_;
  std::shared_ptr<ProfileDataCollector> collector_;
  LoadParams load_parameters_;

  bool include_lib_stats_;
  bool include_server_stats_;
  std::shared_ptr<MPIDriver> mpi_driver_;

  /// The request records of the requests completed during all measurements
  std::vector<RequestRecord> all_request_records_;

  /// The end time of the previous measurement window
  uint64_t previous_window_end_ns_;

  /// Server side statistics from the previous measurement window
  std::map<cb::ModelIdentifier, cb::ModelStatistics> prev_server_side_stats_;

  /// Client side statistics from the previous measurement window
  cb::InferStat prev_client_side_stats_;

  /// Metrics manager that collects server-side metrics periodically
  std::shared_ptr<MetricsManager> metrics_manager_{nullptr};

  /// Whether server-side inference server metrics should be collected.
  bool should_collect_metrics_{false};

  /// User set threshold above which the PA overhead is too significant to
  /// provide usable results.
  const double overhead_pct_threshold_{0.0};

  // Whether to collect profile data.
  bool should_collect_profile_data_{false};
#ifndef DOCTEST_CONFIG_DISABLE
  friend NaggyMockInferenceProfiler;
  friend TestInferenceProfiler;

 public:
  InferenceProfiler() = default;
#endif
};

}}  // namespace triton::perfanalyzer
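The templated Profile method above drives three search modes; in the binary-search branch it keeps narrowing toward the highest load whose measurement still meets the latency threshold. Below is a minimal standalone sketch of that narrowing loop, not part of this commit, in which MeetsThreshold() is a hypothetical stand-in for a real measurement pass:

// Standalone sketch of the binary-search narrowing used when a latency
// threshold is set: keep the highest load that still meets the threshold.
#include <iostream>

// Hypothetical stand-in for "measure at this concurrency and check threshold".
static bool MeetsThreshold(int concurrency) { return concurrency <= 12; }

int main() {
  int lo = 1, hi = 64, step = 1;
  if (!MeetsThreshold(lo)) return 1;  // lowest load already violates threshold
  if (MeetsThreshold(hi)) return 0;   // highest load passes, nothing to search
  while ((hi - lo) > step) {
    int mid = (hi + lo) / 2;
    if (MeetsThreshold(mid)) {
      lo = mid;  // mid still meets the latency threshold, search higher
    } else {
      hi = mid;  // mid violates the threshold, search lower
    }
  }
  std::cout << "highest passing concurrency ~ " << lo << std::endl;  // 12
  return 0;
}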
src/c++/perf_analyzer/ischeduler.h
0 → 100644
View file @ c68e1835
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "rate_schedule.h"
namespace triton { namespace perfanalyzer {

/// Interface for worker threads that use a schedule
///
class IScheduler {
 public:
  /// Provides the schedule that should be followed
  ///
  virtual void SetSchedule(RateSchedulePtr_t schedule) = 0;
};

}}  // namespace triton::perfanalyzer
src/c++/perf_analyzer/iworker.h
0 → 100644
View file @ c68e1835
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace perfanalyzer {

/// Interface for worker threads that generate inference requests
///
class IWorker {
 public:
  virtual void Infer() = 0;
};

}}  // namespace triton::perfanalyzer
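IWorker and IScheduler (in ischeduler.h above) are intentionally tiny: a worker exposes Infer(), and a rate-driven worker additionally accepts a schedule. The sketch below shows one way a worker type could satisfy both; it is not part of this commit, and the *Sketch types and ScheduleSketch alias are simplified stand-ins rather than the real IWorker/IScheduler/RateSchedulePtr_t declarations:

// Standalone sketch of a worker implementing both interfaces.
#include <chrono>
#include <iostream>
#include <memory>
#include <vector>

using ScheduleSketch = std::vector<std::chrono::nanoseconds>;  // stand-in

struct IWorkerSketch {
  virtual void Infer() = 0;
  virtual ~IWorkerSketch() = default;
};

struct ISchedulerSketch {
  virtual void SetSchedule(std::shared_ptr<ScheduleSketch> schedule) = 0;
  virtual ~ISchedulerSketch() = default;
};

class RequestRateWorkerSketch : public IWorkerSketch, public ISchedulerSketch {
 public:
  void SetSchedule(std::shared_ptr<ScheduleSketch> schedule) override {
    schedule_ = schedule;
  }
  void Infer() override {
    // A real worker would sleep until each scheduled timestamp and send a
    // request; here we only report how many slots the schedule contains.
    std::cout << "would send " << (schedule_ ? schedule_->size() : 0)
              << " scheduled requests\n";
  }

 private:
  std::shared_ptr<ScheduleSketch> schedule_;
};

int main() {
  auto worker = std::make_shared<RequestRateWorkerSketch>();
  auto schedule = std::make_shared<ScheduleSketch>();
  schedule->push_back(std::chrono::nanoseconds(0));
  schedule->push_back(std::chrono::nanoseconds(1000000));
  worker->SetSchedule(schedule);
  worker->Infer();
  return 0;
}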
src/c++/perf_analyzer/load_manager.cc
0 → 100644
View file @ c68e1835
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_manager.h"
#include <algorithm>
#include "client_backend/client_backend.h"
#include "infer_data_manager_factory.h"
namespace triton { namespace perfanalyzer {

cb::Error
LoadManager::CheckHealth()
{
  // Check thread status to make sure that the load setting is
  // consistent with the one being reported.
  // If some thread returns early, the main thread will return and
  // the worker thread's error message will be reported
  // when the derived class destructor gets called.
  for (auto& thread_stat : threads_stat_) {
    if (!thread_stat->status_.IsOk()) {
      return cb::Error(
          "Failed to maintain requested inference load."
          " Worker thread(s) failed to generate concurrent requests.",
          pa::GENERIC_ERROR);
    }
    if (!thread_stat->cb_status_.IsOk()) {
      return cb::Error(
          "Failed to retrieve results from inference request.",
          pa::GENERIC_ERROR);
    }
  }
  return cb::Error::Success;
}

cb::Error
LoadManager::SwapRequestRecords(
    std::vector<RequestRecord>& new_request_records)
{
  std::vector<RequestRecord> total_request_records;
  // Gather request records with proper locking from all the worker threads
  for (auto& thread_stat : threads_stat_) {
    std::lock_guard<std::mutex> lock(thread_stat->mu_);
    total_request_records.insert(
        total_request_records.end(), thread_stat->request_records_.begin(),
        thread_stat->request_records_.end());
    thread_stat->request_records_.clear();
  }

  // Swap the results
  total_request_records.swap(new_request_records);
  return cb::Error::Success;
}

uint64_t
LoadManager::CountCollectedRequests()
{
  uint64_t num_of_requests = 0;
  for (auto& thread_stat : threads_stat_) {
    std::lock_guard<std::mutex> lock(thread_stat->mu_);
    num_of_requests += thread_stat->request_records_.size();
  }
  return num_of_requests;
}

cb::Error
LoadManager::GetAccumulatedClientStat(cb::InferStat* contexts_stat)
{
  contexts_stat->completed_request_count = 0;
  contexts_stat->cumulative_receive_time_ns = 0;
  contexts_stat->cumulative_send_time_ns = 0;
  contexts_stat->cumulative_total_request_time_ns = 0;

  for (auto& thread_stat : threads_stat_) {
    std::lock_guard<std::mutex> lock(thread_stat->mu_);
    for (auto& context_stat : thread_stat->contexts_stat_) {
      contexts_stat->completed_request_count +=
          context_stat.completed_request_count;
      contexts_stat->cumulative_total_request_time_ns +=
          context_stat.cumulative_total_request_time_ns;
      contexts_stat->cumulative_send_time_ns +=
          context_stat.cumulative_send_time_ns;
      contexts_stat->cumulative_receive_time_ns +=
          context_stat.cumulative_receive_time_ns;
    }
  }
  return cb::Error::Success;
}

uint64_t
LoadManager::GetIdleTime()
{
  uint64_t total{0};
  size_t num_active_threads = 0;
  for (auto& thread_stat : threads_stat_) {
    std::lock_guard<std::mutex> lock(thread_stat->mu_);
    uint64_t idle_time = thread_stat->idle_timer.GetIdleTime();
    if (idle_time) {
      total += idle_time;
      num_active_threads++;
    }
  }

  // TODO REFACTOR TMA-1043 InferDataManager should have an API to get
  // num_active_threads. This method of determining active threads isn't fully
  // accurate
  if (num_active_threads) {
    total /= num_active_threads;
  }

  return total;
}

void
LoadManager::ResetIdleTime()
{
  for (auto& thread_stat : threads_stat_) {
    std::lock_guard<std::mutex> lock(thread_stat->mu_);
    thread_stat->idle_timer.Reset();
  }
}

const size_t
LoadManager::GetAndResetNumSentRequests()
{
  size_t num_sent_requests{0};

  for (auto& thread_stat : threads_stat_) {
    num_sent_requests += thread_stat->num_sent_requests_;
    thread_stat->num_sent_requests_ = 0;
  }

  return num_sent_requests;
}

LoadManager::LoadManager(
    const bool async, const bool streaming, const int32_t batch_size,
    const size_t max_threads, const SharedMemoryType shared_memory_type,
    const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
    const std::shared_ptr<cb::ClientBackendFactory>& factory)
    : async_(async), streaming_(streaming), batch_size_(batch_size),
      max_threads_(max_threads), parser_(parser), factory_(factory),
      using_json_data_(false)
{
  on_sequence_model_ =
      ((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
       (parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));

  data_loader_.reset(new DataLoader(batch_size_));

  infer_data_manager_ = InferDataManagerFactory::CreateInferDataManager(
      max_threads, batch_size, shared_memory_type, output_shm_size, parser,
      factory, data_loader_);
}

void
LoadManager::InitManager(
    const size_t string_length, const std::string& string_data,
    const bool zero_input, std::vector<std::string>& user_data,
    const uint64_t start_sequence_id, const uint64_t sequence_id_range,
    const size_t sequence_length, const bool sequence_length_specified,
    const double sequence_length_variation)
{
  // Note, this is already caught by the CLI, but adding it here for extra
  // protection
  if (on_sequence_model_ && batch_size_ > 1) {
    throw PerfAnalyzerException(
        "error: sequence models do not support batching", GENERIC_ERROR);
  }

  auto status =
      InitManagerInputs(string_length, string_data, zero_input, user_data);
  THROW_IF_ERROR(status, "Failed to init manager inputs");

  THROW_IF_ERROR(
      infer_data_manager_->Init(), "Unable to init infer data manager");

  sequence_manager_ = MakeSequenceManager(
      start_sequence_id, sequence_id_range, sequence_length,
      sequence_length_specified, sequence_length_variation, using_json_data_,
      data_loader_);

  InitManagerFinalize();
}

cb::Error
LoadManager::InitManagerInputs(
    const size_t string_length, const std::string& string_data,
    const bool zero_input, std::vector<std::string>& user_data)
{
  RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_));

  // Read provided data
  if (!user_data.empty()) {
    if (IsDirectory(user_data[0])) {
      RETURN_IF_ERROR(data_loader_->ReadDataFromDir(
          parser_->Inputs(), parser_->Outputs(), user_data[0]));
    } else {
      using_json_data_ = true;
      for (const auto& json_file : user_data) {
        RETURN_IF_ERROR(data_loader_->ReadDataFromJSON(
            parser_->Inputs(), parser_->Outputs(), json_file));
      }
      std::cout << " Successfully read data for "
                << data_loader_->GetDataStreamsCount() << " stream/streams";
      if (data_loader_->GetDataStreamsCount() == 1) {
        std::cout << " with " << data_loader_->GetTotalSteps(0)
                  << " step/steps";
      }
      std::cout << "." << std::endl;
    }
  } else {
    RETURN_IF_ERROR(data_loader_->GenerateData(
        parser_->Inputs(), zero_input, string_length, string_data));
  }

  // Reserve the required vector space
  threads_stat_.reserve(max_threads_);

  return cb::Error::Success;
}

void
LoadManager::StopWorkerThreads()
{
  early_exit = true;
  // wake up all threads
  wake_signal_.notify_all();

  size_t cnt = 0;
  for (auto& thread : threads_) {
    thread.join();
    if (!threads_stat_[cnt]->status_.IsOk()) {
      std::cerr << "Thread [" << cnt
                << "] had error: " << (threads_stat_[cnt]->status_)
                << std::endl;
    }
    if (!threads_stat_[cnt]->cb_status_.IsOk()) {
      std::cerr << "Thread [" << cnt
                << "] had error: " << (threads_stat_[cnt]->cb_status_)
                << std::endl;
    }
    cnt++;
  }
  threads_.clear();
}

std::shared_ptr<SequenceManager>
LoadManager::MakeSequenceManager(
    const uint64_t start_sequence_id, const uint64_t sequence_id_range,
    const size_t sequence_length, const bool sequence_length_specified,
    const double sequence_length_variation, const bool using_json_data,
    std::shared_ptr<DataLoader> data_loader)
{
  return std::make_shared<SequenceManager>(
      start_sequence_id, sequence_id_range, sequence_length,
      sequence_length_specified, sequence_length_variation, using_json_data,
      data_loader);
}

}}  // namespace triton::perfanalyzer
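GetAndResetNumSentRequests() above sums the per-thread counters and zeroes them, so each measurement window sees only the requests sent during that window; dividing by the window length gives the send request rate the profiler reports. The standalone sketch below, with illustrative numbers and a hypothetical ThreadStatSketch standing in for ThreadStat, is not part of this commit:

// Standalone sketch of the sum-and-reset counter pattern per window.
#include <cstddef>
#include <iostream>
#include <vector>

struct ThreadStatSketch {  // hypothetical stand-in for ThreadStat
  size_t num_sent_requests_{0};
};

static size_t GetAndResetNumSentRequests(std::vector<ThreadStatSketch>& stats) {
  size_t total = 0;
  for (auto& s : stats) {
    total += s.num_sent_requests_;
    s.num_sent_requests_ = 0;  // reset so the next window starts from zero
  }
  return total;
}

int main() {
  std::vector<ThreadStatSketch> stats{{120}, {95}, {85}};
  const double window_duration_s = 5.0;
  size_t sent = GetAndResetNumSentRequests(stats);
  std::cout << "send rate: " << sent / window_duration_s << " req/s\n";  // 60
  return 0;
}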
src/c++/perf_analyzer/load_manager.h
0 → 100644
View file @ c68e1835
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <memory>
#include <random>
#include <thread>
#include "client_backend/client_backend.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "load_worker.h"
#include "perf_utils.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockLoadManager;
#endif

class LoadManager {
 public:
  virtual ~LoadManager() = default;

  /// Initialize the Manager class to set up shared memory and inputs
  /// \param string_length The length of the random strings to be generated
  /// for string inputs.
  /// \param string_data The string to be used as string inputs for model.
  /// \param zero_input Whether to use zero for model inputs.
  /// \param user_data The vector containing path/paths to user-provided data
  /// that can be a directory or path to a json data file.
  /// \param start_sequence_id The starting sequence ID to be used for
  /// iterating through valid sequence IDs.
  /// \param sequence_id_range The maximum sequence ID to be used for iterating
  /// through valid sequence IDs.
  /// \param sequence_length The base length of new sequences.
  /// \param sequence_length_specified Whether the user specified the sequence
  /// length.
  /// \param sequence_length_variation The percentage variation in length of
  /// sequences using autogenerated data as input.
  void InitManager(
      const size_t string_length, const std::string& string_data,
      const bool zero_input, std::vector<std::string>& user_data,
      const uint64_t start_sequence_id, const uint64_t sequence_id_range,
      const size_t sequence_length, const bool sequence_length_specified,
      const double sequence_length_variation);

  /// Check if the load manager is working as expected.
  /// \return cb::Error object indicating success or failure.
  cb::Error CheckHealth();

  /// Swap the content of the request records vector recorded by the load
  /// manager with a new request records vector
  /// \param new_request_records The request records vector to be swapped.
  /// \return cb::Error object indicating success or failure.
  cb::Error SwapRequestRecords(
      std::vector<RequestRecord>& new_request_records);

  /// Get the sum of all contexts' stat
  /// \param contexts_stat Returns the accumulated stat from all contexts
  /// in load manager
  cb::Error GetAccumulatedClientStat(cb::InferStat* contexts_stat);

  /// Returns the average amount of idle time per worker thread in
  /// nanoseconds
  ///
  uint64_t GetIdleTime();

  /// Resets the counters used for tracking idle time
  ///
  void ResetIdleTime();

  /// Calculates and returns the total number of sent requests across all
  /// threads. Resets individual number of sent requests per thread.
  /// \return The total number of sent requests across all threads.
  const size_t GetAndResetNumSentRequests();

  /// \return the batch size used for the inference requests
  virtual size_t BatchSize() const { return batch_size_; }

  /// Count the number of requests collected until now.
  uint64_t CountCollectedRequests();

 protected:
  LoadManager(
      const bool async, const bool streaming, const int32_t batch_size,
      const size_t max_threads, const SharedMemoryType shared_memory_type,
      const size_t output_shm_size,
      const std::shared_ptr<ModelParser>& parser,
      const std::shared_ptr<cb::ClientBackendFactory>& factory);

  /// Complete any subclass-specific manager initialization tasks.
  virtual void InitManagerFinalize() {}

  /// Helper function to retrieve the input data for the inferences
  /// \param string_length The length of the random strings to be generated
  /// for string inputs.
  /// \param string_data The string to be used as string inputs for model.
  /// \param zero_input Whether to use zero for model inputs.
  /// \param user_data The vector containing path/paths to user-provided data
  /// that can be a directory or path to a json data file.
  /// \return cb::Error object indicating success or failure.
  cb::Error InitManagerInputs(
      const size_t string_length, const std::string& string_data,
      const bool zero_input, std::vector<std::string>& user_data);

  /// Stops all the worker threads generating the request load.
  void StopWorkerThreads();

 protected:
  bool async_;
  bool streaming_;
  size_t batch_size_;
  size_t max_threads_;
  bool on_sequence_model_;

  std::shared_ptr<ModelParser> parser_;
  std::shared_ptr<cb::ClientBackendFactory> factory_;

  bool using_json_data_;

  std::shared_ptr<DataLoader> data_loader_;
  std::unique_ptr<cb::ClientBackend> backend_;
  std::shared_ptr<IInferDataManager> infer_data_manager_;

  // Track the workers so they all go out of scope at the
  // same time
  std::vector<std::shared_ptr<IWorker>> workers_;

  // Worker threads that load the server with inferences
  std::vector<std::thread> threads_;

  // Contains the statistics on the current working threads
  std::vector<std::shared_ptr<ThreadStat>> threads_stat_;

  // Use condition variable to pause/continue worker threads
  std::condition_variable wake_signal_;
  std::mutex wake_mutex_;

  std::shared_ptr<SequenceManager> sequence_manager_{nullptr};

  virtual std::shared_ptr<SequenceManager> MakeSequenceManager(
      const uint64_t start_sequence_id, const uint64_t sequence_id_range,
      const size_t sequence_length, const bool sequence_length_specified,
      const double sequence_length_variation, const bool using_json_data,
      std::shared_ptr<DataLoader> data_loader);

#ifndef DOCTEST_CONFIG_DISABLE
  friend NaggyMockLoadManager;

 public:
  LoadManager() = default;
#endif
};

}}  // namespace triton::perfanalyzer
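LoadManager relies on a template-method pattern: InitManager() performs the shared setup and then hands control to the InitManagerFinalize() and MakeSequenceManager() hooks that subclasses override. The self-contained sketch below illustrates that flow only; the classes are simplified stand-ins, not the real LoadManager or ConcurrencyManager:

// Standalone sketch of the InitManager/InitManagerFinalize hook structure.
#include <iostream>

class LoadManagerSketch {
 public:
  void InitManager() {
    std::cout << "shared setup: inputs, infer data, sequence manager\n";
    InitManagerFinalize();  // subclass-specific work runs last
  }

 protected:
  virtual void InitManagerFinalize() {}
};

class ConcurrencyManagerSketch : public LoadManagerSketch {
 protected:
  void InitManagerFinalize() override {
    std::cout << "concurrency-specific setup (e.g. context id tracking)\n";
  }
};

int main() {
  ConcurrencyManagerSketch manager;
  manager.InitManager();
  return 0;
}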
src/c++/perf_analyzer/load_worker.cc
0 → 100644
View file @ c68e1835
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_worker.h"
#include <algorithm>
#include <thread>
#include "client_backend/client_backend.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {

bool
LoadWorker::ShouldExit()
{
  return early_exit || !thread_stat_->cb_status_.IsOk() ||
         !thread_stat_->status_.IsOk();
}

bool
LoadWorker::HandleExitConditions()
{
  if (ShouldExit()) {
    CompleteOngoingSequences();
    WaitForOngoingRequests();
    return true;
  }
  return false;
}

void
LoadWorker::CompleteOngoingSequences()
{
  if (on_sequence_model_) {
    for (size_t ctx_id = 0; ctx_id < ctxs_.size(); ++ctx_id) {
      size_t seq_stat_index = GetSeqStatIndex(ctx_id);
      ctxs_[ctx_id]->CompleteOngoingSequence(seq_stat_index);
    }
  }
}

void
LoadWorker::WaitForOngoingRequests()
{
  while (GetNumOngoingRequests() != 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }
}

uint
LoadWorker::GetNumOngoingRequests()
{
  uint num = 0;
  for (auto ctx : ctxs_) {
    num += ctx->GetNumOngoingRequests();
  }
  return num;
}

void
LoadWorker::CreateContext()
{
  auto ctx = CreateInferContext();
  ctx->Init();
  CreateContextFinalize(ctx);
  ctxs_.push_back(ctx);
}

uint32_t
LoadWorker::GetCtxId()
{
  std::lock_guard<std::mutex> lk(cb_mtx_);
  return ctx_id_tracker_->Get();
}

void
LoadWorker::RestoreFreeCtxId(uint32_t ctx_id)
{
  if (!async_) {
    {
      std::lock_guard<std::mutex> lock(cb_mtx_);
      ctx_id_tracker_->Restore(ctx_id);
    }
  }
}

void
LoadWorker::AsyncCallbackFinalize(uint32_t ctx_id)
{
  // avoid competition over 'cb_mtx_'
  {
    std::lock_guard<std::mutex> lk(cb_mtx_);
    ctx_id_tracker_->Restore(ctx_id);
    notified_ = true;
  }

  cb_cv_.notify_all();
}

}}  // namespace triton::perfanalyzer
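AsyncCallbackFinalize() above returns a context id to the tracker under cb_mtx_, sets notified_, and signals cb_cv_ so a worker blocked waiting for a free context can proceed. The standalone sketch below reproduces that hand-off with a plain queue standing in for ICtxIdTracker; it is illustrative only and not part of this commit (build with -pthread):

// Standalone sketch of the restore-under-lock-then-notify callback pattern.
#include <condition_variable>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

int main() {
  std::queue<uint32_t> free_ctx_ids;  // stand-in for ICtxIdTracker
  std::mutex cb_mtx;
  std::condition_variable cb_cv;
  bool notified = false;

  std::thread completion([&] {  // plays the role of the async callback
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    {
      std::lock_guard<std::mutex> lk(cb_mtx);
      free_ctx_ids.push(0);  // restore the context id
      notified = true;
    }
    cb_cv.notify_all();
  });

  {
    std::unique_lock<std::mutex> lk(cb_mtx);  // worker waits for a free context
    cb_cv.wait(lk, [&] { return notified; });
    std::cout << "free ctx id: " << free_ctx_ids.front() << std::endl;
  }
  completion.join();
  return 0;
}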
src/c++/perf_analyzer/load_worker.h
0 → 100644
View file @ c68e1835
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>

#include "ctx_id_tracker_factory.h"
#include "data_loader.h"
#include "infer_context.h"
#include "iworker.h"
#include "model_parser.h"
#include "sequence_manager.h"

namespace triton { namespace perfanalyzer {

/// Abstract base class for worker threads
///
class LoadWorker : public IWorker {
 protected:
  LoadWorker(
      uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
      const std::shared_ptr<ModelParser> parser,
      std::shared_ptr<DataLoader> data_loader,
      const std::shared_ptr<cb::ClientBackendFactory> factory,
      const bool on_sequence_model, const bool async, const bool streaming,
      const int32_t batch_size, const bool using_json_data,
      std::condition_variable& wake_signal, std::mutex& wake_mutex,
      bool& execute,
      const std::shared_ptr<IInferDataManager>& infer_data_manager,
      std::shared_ptr<SequenceManager> sequence_manager)
      : id_(id), thread_stat_(thread_stat), parser_(parser),
        data_loader_(data_loader), factory_(factory),
        on_sequence_model_(on_sequence_model), async_(async),
        streaming_(streaming), batch_size_(batch_size),
        using_json_data_(using_json_data), wake_signal_(wake_signal),
        wake_mutex_(wake_mutex), execute_(execute),
        infer_data_manager_(infer_data_manager),
        sequence_manager_(sequence_manager)
  {
  }

  virtual ~LoadWorker() = default;

 protected:
  // Return the total number of async requests that have started and not
  // finished
  uint GetNumOngoingRequests();

  void SendInferRequest(uint32_t ctx_id, bool delayed = false)
  {
    if (ShouldExit()) {
      return;
    }

    if (on_sequence_model_) {
      uint32_t seq_stat_index = GetSeqStatIndex(ctx_id);
      ctxs_[ctx_id]->SendSequenceInferRequest(seq_stat_index, delayed);
    } else {
      ctxs_[ctx_id]->SendInferRequest(delayed);
    }
  }

  virtual std::shared_ptr<InferContext> CreateInferContext()
  {
    return std::make_shared<InferContext>(
        id_, ctxs_.size(), async_, streaming_, on_sequence_model_,
        using_json_data_, batch_size_, thread_stat_, data_loader_, parser_,
        factory_, execute_, infer_data_manager_, sequence_manager_);
  }

  // Create an inference context and add it to ctxs_
  virtual void CreateContext();

  // Any code that needs to execute after the Context has been created
  virtual void CreateContextFinalize(std::shared_ptr<InferContext> ctx) = 0;

  // Detect the cases where this thread needs to exit
  bool ShouldExit();

  // Detect and handle the case where this thread needs to exit.
  // Returns true if an exit condition was met
  bool HandleExitConditions();

  void CompleteOngoingSequences();

  void WaitForOngoingRequests();

  virtual uint32_t GetSeqStatIndex(uint32_t ctx_id) = 0;

  uint32_t GetCtxId();

  void RestoreFreeCtxId(uint32_t ctx_id);

  void AsyncCallbackFinalize(uint32_t ctx_id);

  uint32_t id_;

  std::vector<std::shared_ptr<InferContext>> ctxs_;

  std::shared_ptr<ICtxIdTracker> ctx_id_tracker_;

  // Variables used to signal async request completion
  bool notified_ = false;
  std::mutex cb_mtx_;
  std::condition_variable cb_cv_;

  // TODO REFACTOR TMA-1017 is there a better way to do threading than to pass
  // the same cv/mutex into every thread by reference? Used to wake up this
  // thread if it has been put to sleep
  std::condition_variable& wake_signal_;
  std::mutex& wake_mutex_;

  // TODO REFACTOR TMA-1017 is there a better way to communicate this than a
  // shared bool reference? Used to pause execution of this thread
  bool& execute_;

  // Stats for this thread
  std::shared_ptr<ThreadStat> thread_stat_;

  std::shared_ptr<DataLoader> data_loader_;
  const std::shared_ptr<ModelParser> parser_;
  const std::shared_ptr<cb::ClientBackendFactory> factory_;
  const std::shared_ptr<IInferDataManager> infer_data_manager_;

  const bool on_sequence_model_;
  const bool async_;
  const bool streaming_;
  const int32_t batch_size_;
  const bool using_json_data_;

  std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
};

}}  // namespace triton::perfanalyzer
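LoadWorker follows a template-method shape: CreateContext() drives context creation and delegates the mode-specific step to the pure-virtual CreateContextFinalize(), which concrete workers elsewhere in this commit implement. Below is a minimal, self-contained sketch of that shape; the class and member names are illustrative stand-ins, not the real perf_analyzer types.

#include <iostream>
#include <memory>
#include <vector>

// Illustrative stand-in for an inference context.
struct Context {
  void Init() { std::cout << "context initialized\n"; }
};

// Base class owns the creation sequence; subclasses only fill in the hook.
class WorkerBase {
 public:
  virtual ~WorkerBase() = default;

  void CreateContext() {
    auto ctx = std::make_shared<Context>();
    ctx->Init();
    CreateContextFinalize(ctx);  // subclass-specific setup
    ctxs_.push_back(ctx);
  }

 protected:
  virtual void CreateContextFinalize(std::shared_ptr<Context> ctx) = 0;

  std::vector<std::shared_ptr<Context>> ctxs_;
};

class ConcurrencyStyleWorker : public WorkerBase {
 protected:
  void CreateContextFinalize(std::shared_ptr<Context>) override {
    std::cout << "finalized for concurrency mode\n";
  }
};

int main() {
  ConcurrencyStyleWorker w;
  w.CreateContext();
  return 0;
}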
src/c++/perf_analyzer/main.cc
0 → 100644
View file @ c68e1835
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "perf_analyzer.h"
#include "perf_analyzer_exception.h"
namespace
pa
=
triton
::
perfanalyzer
;
int
main
(
int
argc
,
char
*
argv
[])
{
try
{
triton
::
perfanalyzer
::
CLParser
clp
;
pa
::
PAParamsPtr
params
=
clp
.
Parse
(
argc
,
argv
);
PerfAnalyzer
analyzer
(
params
);
analyzer
.
Run
();
}
catch
(
pa
::
PerfAnalyzerException
&
e
)
{
return
e
.
GetError
();
}
return
0
;
}
src/c++/perf_analyzer/metrics.h
0 → 100644
View file @ c68e1835
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <cstdint>
#include <map>
#include <string>

namespace triton { namespace perfanalyzer {

/// Struct that holds server-side metrics for the inference server.
/// The keys for each map are GPU UUIDs and the values are described in the
/// variable names.
struct Metrics {
  std::map<std::string, double> gpu_utilization_per_gpu{};
  std::map<std::string, double> gpu_power_usage_per_gpu{};
  std::map<std::string, uint64_t> gpu_memory_used_bytes_per_gpu{};
  std::map<std::string, uint64_t> gpu_memory_total_bytes_per_gpu{};
};

}}  // namespace triton::perfanalyzer
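Since each map in Metrics is keyed by GPU UUID, a consumer typically iterates the maps to report per-GPU values. A minimal sketch follows, assuming it is compiled inside the perf_analyzer source tree (so "metrics.h" resolves) with a C++17 compiler; the UUID and the numbers are made up for illustration.

#include <iostream>
#include "metrics.h"

int main() {
  triton::perfanalyzer::Metrics m;
  // Hypothetical GPU UUID key and sample values.
  const std::string uuid = "GPU-00000000-0000-0000-0000-000000000000";
  m.gpu_utilization_per_gpu[uuid] = 0.75;
  m.gpu_memory_used_bytes_per_gpu[uuid] = 2ull * 1024 * 1024 * 1024;

  // Report per-GPU utilization.
  for (const auto& [gpu_uuid, util] : m.gpu_utilization_per_gpu) {
    std::cout << gpu_uuid << ": " << util * 100 << "% utilization\n";
  }
  return 0;
}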
src/c++/perf_analyzer/metrics_manager.cc
0 → 100644
View file @ c68e1835
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "metrics_manager.h"
#include <iostream>
#include <stdexcept>
#include <utility>
#include "constants.h"
#include "perf_analyzer_exception.h"
namespace
triton
{
namespace
perfanalyzer
{
MetricsManager
::
MetricsManager
(
std
::
shared_ptr
<
clientbackend
::
ClientBackend
>
client_backend
,
uint64_t
metrics_interval_ms
)
:
client_backend_
(
client_backend
),
metrics_interval_ms_
(
metrics_interval_ms
)
{
}
MetricsManager
::~
MetricsManager
()
{
if
(
query_loop_future_
.
valid
())
{
StopQueryingMetrics
();
}
}
void
MetricsManager
::
StartQueryingMetrics
()
{
should_keep_querying_
=
true
;
query_loop_future_
=
std
::
async
(
&
MetricsManager
::
QueryMetricsEveryNMilliseconds
,
this
);
}
void
MetricsManager
::
QueryMetricsEveryNMilliseconds
()
{
while
(
should_keep_querying_
)
{
const
auto
&
start
{
std
::
chrono
::
system_clock
::
now
()};
Metrics
metrics
{};
clientbackend
::
Error
err
{
client_backend_
->
Metrics
(
metrics
)};
if
(
err
.
IsOk
()
==
false
)
{
throw
PerfAnalyzerException
(
err
.
Message
(),
err
.
Err
());
}
CheckForMissingMetrics
(
metrics
);
{
std
::
lock_guard
<
std
::
mutex
>
metrics_lock
{
metrics_mutex_
};
metrics_
.
push_back
(
std
::
move
(
metrics
));
}
const
auto
&
end
{
std
::
chrono
::
system_clock
::
now
()};
const
auto
&
duration
{
end
-
start
};
const
auto
&
remainder
{
std
::
chrono
::
milliseconds
(
metrics_interval_ms_
)
-
duration
};
CheckForMetricIntervalTooShort
(
remainder
,
duration
);
{
std
::
unique_lock
<
std
::
mutex
>
query_loop_lock
{
query_loop_mutex_
};
query_loop_cv_
.
wait_for
(
query_loop_lock
,
remainder
);
}
}
}
void
MetricsManager
::
CheckForMissingMetrics
(
const
Metrics
&
metrics
)
{
if
(
has_given_missing_metrics_warning_
)
{
return
;
}
if
(
metrics
.
gpu_utilization_per_gpu
.
empty
())
{
std
::
cerr
<<
"WARNING: Unable to parse 'nv_gpu_utilization' metric."
<<
std
::
endl
;
has_given_missing_metrics_warning_
=
true
;
}
if
(
metrics
.
gpu_power_usage_per_gpu
.
empty
())
{
std
::
cerr
<<
"WARNING: Unable to parse 'nv_gpu_power_usage' metric."
<<
std
::
endl
;
has_given_missing_metrics_warning_
=
true
;
}
if
(
metrics
.
gpu_memory_used_bytes_per_gpu
.
empty
())
{
std
::
cerr
<<
"WARNING: Unable to parse 'nv_gpu_memory_used_bytes' metric."
<<
std
::
endl
;
has_given_missing_metrics_warning_
=
true
;
}
if
(
metrics
.
gpu_memory_total_bytes_per_gpu
.
empty
())
{
std
::
cerr
<<
"WARNING: Unable to parse 'nv_gpu_memory_total_bytes' metric."
<<
std
::
endl
;
has_given_missing_metrics_warning_
=
true
;
}
}
void
MetricsManager
::
CheckForMetricIntervalTooShort
(
const
std
::
chrono
::
nanoseconds
&
remainder
,
const
std
::
chrono
::
nanoseconds
&
duration
)
{
if
(
has_given_metric_interval_warning_
)
{
return
;
}
if
(
remainder
<
std
::
chrono
::
nanoseconds
::
zero
())
{
std
::
cerr
<<
"WARNING: Triton metrics endpoint latency ("
<<
std
::
chrono
::
duration_cast
<
std
::
chrono
::
milliseconds
>
(
duration
)
.
count
()
<<
"ms) is larger than the querying interval ("
<<
metrics_interval_ms_
<<
"ms). Please try a larger querying interval "
"via `--triton-metrics-interval`."
<<
std
::
endl
;
has_given_metric_interval_warning_
=
true
;
}
}
void
MetricsManager
::
CheckQueryingStatus
()
{
if
(
query_loop_future_
.
valid
()
&&
query_loop_future_
.
wait_for
(
std
::
chrono
::
seconds
(
0
))
==
std
::
future_status
::
ready
)
{
query_loop_future_
.
get
();
}
}
void
MetricsManager
::
GetLatestMetrics
(
std
::
vector
<
Metrics
>&
metrics
)
{
if
(
metrics
.
empty
()
==
false
)
{
throw
PerfAnalyzerException
(
"MetricsManager::GetLatestMetrics() must be passed an empty vector."
,
GENERIC_ERROR
);
}
std
::
lock_guard
<
std
::
mutex
>
metrics_lock
{
metrics_mutex_
};
metrics_
.
swap
(
metrics
);
}
void
MetricsManager
::
StopQueryingMetrics
()
{
should_keep_querying_
=
false
;
query_loop_cv_
.
notify_one
();
if
(
query_loop_future_
.
valid
())
{
query_loop_future_
.
get
();
}
}
}}
// namespace triton::perfanalyzer
src/c++/perf_analyzer/metrics_manager.h
0 → 100644
View file @ c68e1835
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <future>
#include <memory>
#include <mutex>
#include <vector>

#include "client_backend/client_backend.h"
#include "metrics.h"

namespace triton { namespace perfanalyzer {

#ifndef DOCTEST_CONFIG_DISABLE
class TestMetricsManager;
#endif

class MetricsManager {
 public:
  MetricsManager(
      std::shared_ptr<clientbackend::ClientBackend> client_backend,
      uint64_t metrics_interval_ms);

  /// Ends the background thread, redundant in case StopQueryingMetrics() isn't
  /// called
  ~MetricsManager();

  /// Starts background thread that queries metrics on an interval
  void StartQueryingMetrics();

  /// Checks if background thread threw exception and propagates it if so
  void CheckQueryingStatus();

  /// Puts the latest-collected metrics from background thread into vector
  /// output parameter to be used by main thread
  void GetLatestMetrics(std::vector<Metrics>& metrics_per_timestamp);

  /// Ends the background thread
  void StopQueryingMetrics();

 private:
  void QueryMetricsEveryNMilliseconds();
  void CheckForMissingMetrics(const Metrics& metrics);
  void CheckForMetricIntervalTooShort(
      const std::chrono::nanoseconds& remainder,
      const std::chrono::nanoseconds& duration);

  std::shared_ptr<clientbackend::ClientBackend> client_backend_{nullptr};
  uint64_t metrics_interval_ms_{0};
  std::mutex metrics_mutex_{};
  std::vector<Metrics> metrics_{};
  bool should_keep_querying_{false};
  std::future<void> query_loop_future_{};
  std::mutex query_loop_mutex_{};
  std::condition_variable query_loop_cv_{};
  bool has_given_missing_metrics_warning_{false};
  bool has_given_metric_interval_warning_{false};

#ifndef DOCTEST_CONFIG_DISABLE
  friend TestMetricsManager;

 public:
  MetricsManager() = default;
#endif
};

}}  // namespace triton::perfanalyzer
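The manager's lifecycle is: StartQueryingMetrics() launches the polling loop via std::async, the loop sleeps on a condition variable so StopQueryingMetrics() can interrupt it promptly, and calling get() on the future propagates any exception the loop threw. A self-contained sketch of that start/stop pattern follows, with illustrative names only; the real class additionally buffers Metrics snapshots and exposes them through GetLatestMetrics().

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>

// Illustrative stand-in for the StartQueryingMetrics()/StopQueryingMetrics()
// pattern: std::async runs the loop, a condition variable makes the sleep
// interruptible, and future.get() rethrows any exception from the loop.
class Poller {
 public:
  void Start() {
    keep_going_ = true;
    loop_ = std::async(std::launch::async, &Poller::Loop, this);
  }

  void Stop() {
    keep_going_ = false;
    cv_.notify_one();
    if (loop_.valid()) {
      loop_.get();  // propagates exceptions thrown inside Loop()
    }
  }

 private:
  void Loop() {
    while (keep_going_) {
      std::cout << "query metrics endpoint\n";  // stands in for the backend call
      std::unique_lock<std::mutex> lk(mtx_);
      cv_.wait_for(lk, std::chrono::milliseconds(100));
    }
  }

  std::atomic<bool> keep_going_{false};  // atomic here for safety in the sketch
  std::future<void> loop_{};
  std::mutex mtx_{};
  std::condition_variable cv_{};
};

int main() {
  Poller p;
  p.Start();
  std::this_thread::sleep_for(std::chrono::milliseconds(350));
  p.Stop();
  return 0;
}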
src/c++/perf_analyzer/mock_concurrency_worker.h
0 → 100644
View file @ c68e1835
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include "concurrency_worker.h"
#include "gmock/gmock.h"

namespace triton { namespace perfanalyzer {

class NaggyMockConcurrencyWorker : public ConcurrencyWorker {
 public:
  NaggyMockConcurrencyWorker(
      uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
      std::shared_ptr<ThreadConfig> thread_config,
      const std::shared_ptr<ModelParser> parser,
      std::shared_ptr<DataLoader> data_loader,
      const std::shared_ptr<cb::ClientBackendFactory> factory,
      const bool on_sequence_model, const bool async,
      const size_t max_concurrency, const bool using_json_data,
      const bool streaming, const int32_t batch_size,
      std::condition_variable& wake_signal, std::mutex& wake_mutex,
      size_t& active_threads, bool& execute,
      const std::shared_ptr<IInferDataManager>& infer_data_manager,
      std::shared_ptr<SequenceManager> sequence_manager)
      : ConcurrencyWorker(
            id, thread_stat, thread_config, parser, data_loader, factory,
            on_sequence_model, async, max_concurrency, using_json_data,
            streaming, batch_size, wake_signal, wake_mutex, active_threads,
            execute, infer_data_manager, sequence_manager)
  {
    ON_CALL(*this, Infer()).WillByDefault([this]() -> void {
      ConcurrencyWorker::Infer();
    });
  }

  MOCK_METHOD(void, Infer, (), (override));

  void EmptyInfer() { thread_config_->is_paused_ = true; }
};

// Non-naggy version of Mock (won't warn when using default gmock
// mocked function)
using MockConcurrencyWorker = testing::NiceMock<NaggyMockConcurrencyWorker>;

}}  // namespace triton::perfanalyzer
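The naggy/nice split above is a common GoogleMock idiom: the naggy class delegates Infer() to the real implementation by default, and the NiceMock alias suppresses "uninteresting call" warnings in tests that set no expectations. A stripped-down sketch of the same idiom, assuming GoogleMock is available; the Worker class and the commented test snippet are illustrative, not the perf_analyzer types.

#include "gmock/gmock.h"

// A small interface with a real default behavior.
class Worker {
 public:
  virtual ~Worker() = default;
  virtual void Infer() { /* real work would happen here */ }
};

// Mock that falls back to the real Infer() unless a test overrides it.
class NaggyMockWorker : public Worker {
 public:
  NaggyMockWorker() {
    ON_CALL(*this, Infer()).WillByDefault([this]() { Worker::Infer(); });
  }
  MOCK_METHOD(void, Infer, (), (override));
};

// NiceMock variant that stays quiet about unexpected calls.
using MockWorker = testing::NiceMock<NaggyMockWorker>;

// In a test body one might then write:
//   MockWorker w;
//   EXPECT_CALL(w, Infer()).Times(1);
//   w.Infer();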