/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Declare IssueQueryController and other helper classes for
/// query issuing.
#ifndef MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
#define MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
#include <stdint.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <future>
#include <mutex>
#include <queue>
#include <random>
#include <thread>
#include <vector>
#include "loadgen.h"
#include "logging.h"
#include "query_sample.h"
#include "system_under_test.h"
#include "test_settings_internal.h"
#include "utils.h"
namespace mlperf {
namespace loadgen {
struct SampleMetadata;
class QueryMetadata;
/// \brief Every query and sample within a call to StartTest gets a unique
/// sequence id for easy cross reference, and a random number which is used to
/// determine accuracy logging when it is enabled.
struct SequenceGen {
uint64_t NextQueryId() { return query_id++; }
uint64_t NextSampleId() { return sample_id++; }
uint64_t CurrentSampleId() { return sample_id; }
double NextAccLogRng() { return accuracy_log_dist(accuracy_log_rng); }
void InitAccLogRng(uint64_t accuracy_log_rng_seed) {
accuracy_log_rng = std::mt19937(accuracy_log_rng_seed);
}
private:
uint64_t query_id = 0;
uint64_t sample_id = 0;
std::mt19937 accuracy_log_rng;
std::uniform_real_distribution<double> accuracy_log_dist =
std::uniform_real_distribution<double>(0, 1);
};
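// Illustrative note (not part of the API): with accuracy_log_probability set
// to 0.1, each sample draws an accuracy_log_val in [0, 1) via NextAccLogRng(),
// and roughly 10% of samples fall at or below the logging threshold applied
// later in ResponseDelegate::SampleComplete.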
/// \brief An interface for a particular scenario + mode to implement for
/// extended handling of sample completion.
struct ResponseDelegate {
virtual ~ResponseDelegate() = default;
virtual void SampleComplete(SampleMetadata*, QuerySampleResponse*,
PerfClock::time_point,
const ResponseCallback&) = 0;
virtual void TokenComplete(SampleMetadata*, QuerySampleResponse*,
PerfClock::time_point,
const ResponseCallback&) = 0;
virtual void QueryComplete() = 0;
std::atomic<size_t> queries_completed{0};
};
/// \brief Used by the loadgen to coordinate response data and completion.
struct SampleMetadata {
QueryMetadata* query_metadata;
uint64_t sequence_id;
QuerySampleIndex sample_index;
double accuracy_log_val;
};
/// \brief Maintains data and timing info for a query and all its samples.
class QueryMetadata {
public:
QueryMetadata(const std::vector<QuerySampleIndex>& query_sample_indices,
std::chrono::nanoseconds scheduled_delta,
ResponseDelegate* response_delegate, SequenceGen* sequence_gen);
QueryMetadata(QueryMetadata&& src);
void NotifyOneSampleCompleted(PerfClock::time_point timestamp);
void WaitForAllSamplesCompleted();
PerfClock::time_point WaitForAllSamplesCompletedWithTimestamp();
/// \brief Coalesce multiple queries into one query.
/// When server_coalesce_queries is set to true in the Server scenario, we
/// sometimes coalesce multiple queries into one query. This is done by moving
/// the other queries' samples into the current query, while maintaining their
/// original scheduled_time.
void CoalesceQueries(QueryMetadata* queries, size_t first, size_t last,
size_t stride);
/// \brief Set a coalesced query back to its original state.
void Decoalesce();
public:
std::vector<QuerySample> query_to_send;
const std::chrono::nanoseconds scheduled_delta;
ResponseDelegate* const response_delegate;
const uint64_t sequence_id;
// Performance information.
size_t scheduled_intervals = 0; // Number of intervals between queries, as
// actually scheduled during the run.
// For the MultiStream scenario only.
PerfClock::time_point scheduled_time;
PerfClock::time_point issued_start_time;
PerfClock::time_point all_samples_done_time;
private:
std::atomic<size_t> wait_count_;
std::promise<void> all_samples_done_;
std::vector<SampleMetadata> samples_;
};
/// \brief A state object for communications between the controller and its
/// caller.
struct IssueQueryState {
// Information from caller to controller.
SystemUnderTest* sut;
std::vector<QueryMetadata>* queries;
ResponseDelegate* response_delegate;
const TestSettingsInternal* settings;
TestMode mode;
// Information from controller to caller.
std::chrono::system_clock::time_point start_for_power;
PerfClock::time_point start_time;
bool ran_out_of_generated_queries;
size_t queries_issued;
size_t expected_latencies;
// The lock to modify this state (in multi-thread case).
std::mutex mtx;
};
/// \brief Controls the query issuing part.
/// This controller handles both the case where the user registers
/// IssueQueryThreads and the case where they do not. It is implemented as a
/// singleton and is NOT thread-safe (i.e. users should not call StartTest()
/// on multiple threads); it is, however, thread-safe with regard to
/// IssueQueryThreads.
class IssueQueryController {
public:
/// \brief Get the controller instance singleton.
static IssueQueryController& GetInstance();
/// \brief Don't allow copy. This is a singleton.
IssueQueryController(IssueQueryController const&) = delete;
void operator=(IssueQueryController const&) = delete;
/// \brief Register an IssueQueryThread.
/// It is blocking until the entire test ends.
void RegisterThread();
/// \brief Set the number of IssueQueryThreads and wait for thread
/// registration. If for any reason the number of registered threads does not
/// match the specified number, it prints out an error.
void SetNumThreads(size_t n);
/// \brief Kick off the query issuing.
/// Queries are issued on the current thread if there are no registered
/// IssueQueryThreads or if the scenario is not Server.
template <TestScenario scenario>
void StartIssueQueries(IssueQueryState* s);
/// \brief Notify the IssueQueryThreads to end.
void EndThreads();
private:
/// \brief Hide constructor. This is a singleton.
IssueQueryController() {}
/// \brief The internal helper which actually issues queries.
/// This should be called by the thread(s) which issues queries.
template <TestScenario scenario, bool multi_thread>
void IssueQueriesInternal(size_t query_stride, size_t thread_idx);
/// \brief The issue query state.
IssueQueryState* state;
/// \brief Locks for communications across IssueQueryThreads and the main
/// thread.
std::mutex mtx;
std::condition_variable cond_var;
/// \brief Thread ids of the registered IssueQueryThreads.
std::vector<std::thread::id> thread_ids;
size_t num_threads{0};
/// \brief Whether the threads should be actively issuing queries.
bool issuing{false};
/// \brief Flags for each IssueQueryThread to mark that it is done.
std::vector<bool> thread_complete;
/// \brief Whether the threads can end now.
bool end_test{false};
};
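// Illustrative flow (a sketch, not normative): each user thread calls
// RegisterThread() and blocks; the loadgen calls SetNumThreads(n), then
// StartIssueQueries<scenario>() wakes the registered threads to run
// IssueQueriesInternal(), and EndThreads() releases them once the test ends.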
} // namespace loadgen
} // namespace mlperf
#endif // MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "loadgen.h"
#include <stdint.h>
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstring>
#include <ctime>
#include <fstream>
#include <future>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <queue>
#include <random>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include "early_stopping.h"
#include "issue_query_controller.h"
#include "logging.h"
#include "query_sample.h"
#include "query_sample_library.h"
#include "results.h"
#include "system_under_test.h"
#include "test_settings.h"
#include "test_settings_internal.h"
#include "utils.h"
#include "version.h"
namespace mlperf {
/// \brief Loadgen implementation details.
namespace loadgen {
/// \brief A random set of samples in the QSL that should fit in RAM when
/// loaded together.
struct LoadableSampleSet {
std::vector<QuerySampleIndex> set;
const size_t sample_distribution_end; // Excludes padding in MultiStream.
};
/// \brief Generates nanoseconds from a start time to multiple end times.
/// TODO: This isn't very useful anymore. Remove it.
struct DurationGeneratorNs {
const PerfClock::time_point start;
int64_t delta(PerfClock::time_point end) const {
return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
.count();
}
};
/// \brief ResponseDelegate implementation templated by scenario and mode.
template <TestScenario scenario, TestMode mode>
struct ResponseDelegateDetailed : public ResponseDelegate {
double accuracy_log_offset = 0.0;
double accuracy_log_prob = 0.0;
void SampleComplete(SampleMetadata* sample, QuerySampleResponse* response,
PerfClock::time_point complete_begin_time,
const ResponseCallback& response_cb) override {
// Using a raw pointer here should help us hit the std::function
// small buffer optimization code path when we aren't copying data.
// For some reason, using std::unique_ptr<std::vector> wasn't moving
// into the lambda; even with C++14.
std::vector<uint8_t>* sample_data_copy = nullptr;
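// Apply the per-run random offset to the per-sample value, wrapping the sum
// back into [0, 1), so the logged subset varies across runs while staying
// deterministic for a given accuracy_log_rng_seed.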
double accuracy_log_val =
sample->accuracy_log_val + accuracy_log_offset < 1.0
? sample->accuracy_log_val + accuracy_log_offset
: sample->accuracy_log_val + accuracy_log_offset - 1.0;
if (mode == TestMode::AccuracyOnly ||
accuracy_log_val <= accuracy_log_prob) {
// If a response_cb callback is provided, data only needs to reside on the
// host *after* calling it. Note that the callback is blocking and will
// likely involve a memcpy from the accelerator to the host.
if (response_cb) {
response_cb(response);
}
// TODO: Verify accuracy with the data copied here.
uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
uint8_t* src_end = src_begin + response->size;
sample_data_copy = new std::vector<uint8_t>(src_begin, src_end);
}
int64_t n_tokens = response->n_tokens;
Log([sample, complete_begin_time, sample_data_copy,
n_tokens](AsyncLog& log) {
QueryMetadata* query = sample->query_metadata;
DurationGeneratorNs sched{query->scheduled_time};
if (scenario == TestScenario::Server) {
// Trace the server scenario as a stacked graph via counter events.
DurationGeneratorNs issued{query->issued_start_time};
log.TraceCounterEvent("Latency", query->scheduled_time, "issue_delay",
sched.delta(query->issued_start_time),
"issue_to_done",
issued.delta(complete_begin_time));
}
// While visualizing overlapping samples in offline mode is not
// practical, sample completion is still recorded for auditing purposes.
log.TraceSample("Sample", sample->sequence_id, query->scheduled_time,
complete_begin_time, "sample_seq", sample->sequence_id,
"query_seq", query->sequence_id, "sample_idx",
sample->sample_index, "issue_start_ns",
sched.delta(query->issued_start_time), "complete_ns",
sched.delta(complete_begin_time));
if (sample_data_copy) {
log.LogAccuracy(sample->sequence_id, sample->sample_index,
LogBinaryAsHexString{sample_data_copy}, n_tokens);
delete sample_data_copy;
}
// Record the latency at the end, since it will unblock the issuing
// thread and potentially destroy the metadata being used above.
QuerySampleLatency latency = sched.delta(complete_begin_time);
log.RecordSampleCompletion(sample->sequence_id, complete_begin_time,
latency, n_tokens);
});
}
void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response,
PerfClock::time_point complete_begin_time,
const ResponseCallback& response_cb) override {
// Using a raw pointer here should help us hit the std::function
// small buffer optimization code path when we aren't copying data.
// For some reason, using std::unique_ptr<std::vector> wasn't moving
// into the lambda; even with C++14.
std::vector<uint8_t>* token_data_copy = nullptr;
double accuracy_log_val =
sample->accuracy_log_val + accuracy_log_offset < 1.0
? sample->accuracy_log_val + accuracy_log_offset
: sample->accuracy_log_val + accuracy_log_offset - 1.0;
if (mode == TestMode::AccuracyOnly ||
accuracy_log_val <= accuracy_log_prob) {
uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
uint8_t* src_end = src_begin + response->size;
token_data_copy = new std::vector<uint8_t>(src_begin, src_end);
}
Log([sample, complete_begin_time, token_data_copy](AsyncLog& log) {
QueryMetadata* query = sample->query_metadata;
DurationGeneratorNs sched{query->scheduled_time};
if (scenario == TestScenario::Server) {
DurationGeneratorNs issued{query->issued_start_time};
log.TraceCounterEvent(
"Token_Latency", query->scheduled_time, "issue_delay",
sched.delta(query->issued_start_time), "issue_to_done",
issued.delta(complete_begin_time));
} else {
log.TraceSample("Token", sample->sequence_id, query->scheduled_time,
complete_begin_time, "sample_seq", sample->sequence_id,
"query_seq", query->sequence_id, "sample_idx",
sample->sample_index, "issue_start_ns",
sched.delta(query->issued_start_time), "complete_ns",
sched.delta(complete_begin_time));
}
if (token_data_copy) {
log.CacheToken(sample->sequence_id,
LogBinaryAsHexString{token_data_copy});
}
QuerySampleLatency latency = sched.delta(complete_begin_time);
log.RecordTokenCompletion(sample->sequence_id, complete_begin_time,
latency);
});
}
void QueryComplete() override {
// We only need to track outstanding queries in the server scenario to
// detect when the SUT has fallen too far behind.
if (scenario == TestScenario::Server) {
queries_completed.fetch_add(1, std::memory_order_relaxed);
}
}
};
/// \brief Selects the query timestamps for all scenarios except Server.
template <TestScenario scenario>
auto ScheduleDistribution(double qps) {
return [period = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::duration<double>(1.0 / qps))](auto& /*gen*/) {
return period;
};
}
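// For example (illustrative): with qps = 4, the lambda above yields a fixed
// 250ms period between queries, independent of the generator argument.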
/// \brief Selects the query timestamps for the Server scenario.
template <>
auto ScheduleDistribution<TestScenario::Server>(double qps) {
// Poisson arrival process corresponds to exponentially distributed
// interarrival times.
return [dist = std::exponential_distribution<>(qps)](auto& gen) mutable {
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::duration<double>(dist(gen)));
};
}
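// Note: exponential inter-arrival times with rate qps have mean 1/qps
// seconds, so the long-run scheduled arrival rate matches the target QPS.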
/// \brief Selects samples for the accuracy mode.
template <TestMode mode>
auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
std::vector<size_t> indices;
for (size_t i = 0; i < sample_count; i += stride) {
indices.push_back(i);
}
std::shuffle(indices.begin(), indices.end(), *rng);
return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
return indices.at(i++);
};
}
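// Illustrative example: sample_count = 8 and stride = 2 produce indices
// {0, 2, 4, 6}, shuffled once up front; each call then returns the next
// index, so every stride-th sample is chosen exactly once.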
/// \brief Selects samples for the performance mode.
template <>
auto SampleDistribution<TestMode::PerformanceOnly>(size_t sample_count,
size_t /*stride*/,
std::mt19937* /*rng*/) {
return [dist = std::uniform_int_distribution<>(0, sample_count - 1)](
auto& gen) mutable { return dist(gen); };
}
/// \brief Sample across the dataset, and ensure coverage of each of the
/// samples.
// Useful for non-uniform datasets (e.g. Llama2, GPTJ, 3d-unet)
auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size,
std::mt19937* rng) {
std::vector<size_t> indices;
std::vector<size_t> shuffle_indices(set_size);
std::iota(shuffle_indices.begin(), shuffle_indices.end(), 0);
for (size_t j = 0; j < sample_count; j += set_size) {
std::shuffle(shuffle_indices.begin(), shuffle_indices.end(), *rng);
indices.insert(indices.end(), shuffle_indices.begin(),
shuffle_indices.end());
}
return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
return indices.at((i++) % indices.size());
};
}
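// Illustrative example: with set_size = 3 and sample_count = 5, two shuffled
// permutations of {0, 1, 2} are concatenated (6 indices in total); the modulo
// in the returned lambda then cycles through them so per-sample issue counts
// stay within one of each other, however many samples are drawn.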
/// \brief Generates queries for the requested settings, templated by
/// scenario and mode.
/// \todo Make GenerateQueries faster.
/// QueryMetadata is expensive to move; reserve queries in advance so the
/// queries vector doesn't need to grow, and/or parent samples to their
/// queries only after all queries have been generated.
/// \todo For the server scenario only, scale the query timeline at the end so
/// the QPS as scheduled is equal to the QPS as requested.
template <TestScenario scenario, TestMode mode>
std::vector<QueryMetadata> GenerateQueries(
const TestSettingsInternal& settings,
const LoadableSampleSet& loaded_sample_set, SequenceGen* sequence_gen,
ResponseDelegate* response_delegate) {
auto tracer =
MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); });
auto& loaded_samples = loaded_sample_set.set;
// Generate 2x more samples than we think we'll need given the expected
// QPS in case the SUT is faster than expected.
// We should exit before issuing all queries.
// Does not apply to the server scenario since the duration only
// depends on the ideal scheduled time, not the actual issue time.
const int duration_multiplier = scenario == TestScenario::Server ? 1 : 2;
std::chrono::microseconds gen_duration =
duration_multiplier * settings.target_duration;
size_t min_queries = settings.min_query_count;
size_t samples_per_query = settings.samples_per_query;
if (mode == TestMode::AccuracyOnly && scenario == TestScenario::Offline) {
samples_per_query = loaded_sample_set.sample_distribution_end;
}
// We should not exit early in accuracy mode.
if (mode == TestMode::AccuracyOnly || settings.performance_issue_unique) {
gen_duration = std::chrono::microseconds(0);
// Integer truncation here is intentional.
// For MultiStream, loaded samples is properly padded.
// For Offline, we create a 'remainder' query at the end of this function.
min_queries = loaded_samples.size() / samples_per_query;
}
std::vector<QueryMetadata> queries;
// Using the std::mt19937 pseudo-random number generator ensures a modicum of
// cross platform reproducibility for trace generation.
std::mt19937 sample_rng(settings.sample_index_rng_seed);
std::mt19937 schedule_rng(settings.schedule_rng_seed);
constexpr bool kIsMultiStream = scenario == TestScenario::MultiStream;
const size_t sample_stride = kIsMultiStream ? samples_per_query : 1;
auto sample_distribution = SampleDistribution<mode>(
loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
// Use the same unique sample distribution as in AccuracyMode to choose
// samples when either the performance_issue_unique or the
// performance_issue_same flag is set.
auto sample_distribution_unique = SampleDistribution<TestMode::AccuracyOnly>(
loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
min_queries, loaded_samples.size(), &sample_rng);
auto schedule_distribution =
ScheduleDistribution<scenario>(settings.target_qps);
// When sample_concatenate_permutation is turned on, pad to a multiple of the
// complete dataset to ensure fairness.
auto enable_equal_issue = settings.sample_concatenate_permutation;
if (mode != TestMode::AccuracyOnly && enable_equal_issue) {
if (scenario == TestScenario::Offline &&
samples_per_query % loaded_samples.size() != 0) {
// In offline mode, we pad samples_per_query
size_t pad_size =
(loaded_samples.size() - samples_per_query % loaded_samples.size());
samples_per_query += pad_size;
} else if ((scenario != TestScenario::Offline) &&
(min_queries % loaded_samples.size() != 0)) {
// In the Server, SingleStream, and MultiStream scenarios, min_queries
// should be padded.
size_t pad_size =
(loaded_samples.size() - min_queries % loaded_samples.size());
min_queries += pad_size;
}
}
std::vector<QuerySampleIndex> samples(samples_per_query);
std::chrono::nanoseconds timestamp(0);
std::chrono::nanoseconds prev_timestamp(0);
// Choose a single sample to repeat when in performance_issue_same mode
QuerySampleIndex same_sample = settings.performance_issue_same_index;
while (prev_timestamp < gen_duration || queries.size() < min_queries) {
if (kIsMultiStream) {
QuerySampleIndex sample_i = settings.performance_issue_unique
? sample_distribution_unique(sample_rng)
: settings.performance_issue_same
? same_sample
: sample_distribution(sample_rng);
for (auto& s : samples) {
// Select contiguous samples in the MultiStream scenario.
// This will not overflow, since GenerateLoadableSets adds padding at
// the end of the loadable sets in the MultiStream scenario.
// The padding allows the starting samples to be the same for each
// query with respect to samples_per_query.
s = loaded_samples[sample_i++];
}
} else if (scenario == TestScenario::Offline) {
// For the Offline + Performance scenario, we also want to support
// contiguous samples. In this scenario the query can be much larger than
// what fits into memory. We simply repeat loaded_samples N times, plus a
// remainder to ensure we fill up samples. Note that this eliminates
// randomization.
size_t num_loaded_samples = loaded_samples.size();
size_t num_full_repeats = samples_per_query / num_loaded_samples;
uint64_t remainder = samples_per_query % (num_loaded_samples);
if (settings.performance_issue_same) {
std::fill(samples.begin(), samples.begin() + samples_per_query,
loaded_samples[same_sample]);
} else {
for (size_t i = 0; i < num_full_repeats; ++i) {
std::copy(loaded_samples.begin(), loaded_samples.end(),
samples.begin() + i * num_loaded_samples);
if (settings.sample_concatenate_permutation) {
std::shuffle(samples.begin() + i * num_loaded_samples,
samples.begin() + (i + 1) * num_loaded_samples,
sample_rng);
}
}
std::copy(loaded_samples.begin(), loaded_samples.begin() + remainder,
samples.begin() + num_full_repeats * num_loaded_samples);
if (settings.sample_concatenate_permutation) {
assert(remainder == 0);
}
}
} else {
for (auto& s : samples) {
s = loaded_samples[settings.performance_issue_unique
? sample_distribution_unique(sample_rng)
: settings.performance_issue_same ? same_sample
: enable_equal_issue
? sample_distribution_equal_issue(sample_rng)
: sample_distribution(sample_rng)];
}
}
queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
prev_timestamp = timestamp;
timestamp += schedule_distribution(schedule_rng);
// In equal_issue mode, the min_queries will be bumped up by a multiple of
// the dataset size if the test time has not met the threshold.
if (enable_equal_issue && (queries.size() >= min_queries) &&
(prev_timestamp < gen_duration) &&
(scenario != TestScenario::Offline)) {
min_queries += loaded_samples.size();
}
}
// See if we need to create a "remainder" query for offline+accuracy to
// ensure we issue all samples in loaded_samples. Offline doesn't pad
// loaded_samples like MultiStream does.
if (scenario == TestScenario::Offline && mode == TestMode::AccuracyOnly) {
size_t remaining_samples = loaded_samples.size() % samples_per_query;
if (remaining_samples != 0) {
samples.resize(remaining_samples);
for (auto& s : samples) {
s = loaded_samples[sample_distribution(sample_rng)];
}
queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
}
}
LogDetail([count = queries.size(), spq = samples_per_query,
duration = timestamp.count()](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generated_query_count", count);
MLPERF_LOG(detail, "generated_samples_per_query", spq);
MLPERF_LOG(detail, "generated_query_duration", duration);
#else
detail("GeneratedQueries: ", "queries", count, "samples per query", spq,
"duration", duration);
#endif
});
return queries;
}
/// \brief Issues a series of pre-generated queries.
// TODO: Templates for scenario and mode are overused, given the loadgen
// no longer generates queries on the fly. Should we reduce the
// use of templates?
template <TestScenario scenario, TestMode mode>
PerformanceResult IssueQueries(SystemUnderTest* sut,
const TestSettingsInternal& settings,
const LoadableSampleSet& loaded_sample_set,
SequenceGen* sequence_gen) {
// Create response handler.
ResponseDelegateDetailed<scenario, mode> response_logger;
std::uniform_real_distribution<double> accuracy_log_offset_dist =
std::uniform_real_distribution<double>(0.0, 1.0);
std::mt19937 accuracy_log_offset_rng(settings.accuracy_log_rng_seed);
response_logger.accuracy_log_offset =
accuracy_log_offset_dist(accuracy_log_offset_rng);
response_logger.accuracy_log_prob = settings.accuracy_log_probability;
// Generate queries.
auto sequence_id_start = sequence_gen->CurrentSampleId();
std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
settings, loaded_sample_set, sequence_gen, &response_logger);
// Calculate the expected number of queries.
uint64_t expected_queries =
settings.target_qps * settings.min_duration.count() / 1000;
uint64_t minimum_queries =
settings.min_query_count * settings.samples_per_query;
if (scenario != TestScenario::Offline) {
expected_queries *= settings.samples_per_query;
} else {
minimum_queries = settings.min_sample_count;
}
expected_queries =
expected_queries < minimum_queries ? minimum_queries : expected_queries;
if (settings.accuracy_log_sampling_target > 0) {
response_logger.accuracy_log_prob =
(double)settings.accuracy_log_sampling_target / expected_queries;
}
auto sequence_id_end = sequence_gen->CurrentSampleId();
size_t max_latencies_to_record = sequence_id_end - sequence_id_start;
// Initialize logger for latency recording.
GlobalLogger().RestartLatencyRecording(sequence_id_start,
max_latencies_to_record);
// Create and initialize an IssueQueryState.
IssueQueryState state{
sut, &queries, &response_logger, &settings, mode, {}, {}, false, 0,
0, {}};
auto& controller = IssueQueryController::GetInstance();
// Set number of IssueQueryThreads and wait for the threads to register.
controller.SetNumThreads(settings.requested.server_num_issue_query_threads);
// Start issuing the queries.
controller.StartIssueQueries<scenario>(&state);
// Gather query issuing statistics.
const auto start_for_power = state.start_for_power;
const auto start = state.start_time;
const auto ran_out_of_generated_queries = state.ran_out_of_generated_queries;
const auto queries_issued = state.queries_issued;
const auto expected_latencies = state.expected_latencies;
// Let the SUT know it should not expect any more queries.
sut->FlushQueries();
if (mode == TestMode::PerformanceOnly && ran_out_of_generated_queries) {
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR(
detail, "error_runtime",
"Ending early: Ran out of generated queries to issue before the "
"minimum query count and test duration were reached. "
"Please update the relevant expected latency or target qps in the "
"TestSettings so they are more accurate.");
#else
detail.Error(
"Ending early: Ran out of generated queries to issue before the "
"minimum query count and test duration were reached.");
detail(
"Please update the relevant expected latency or target qps in the "
"TestSettings so they are more accurate.");
#endif
});
}
// Wait for tail queries to complete and collect all the latencies.
// We have to keep the synchronization primitives alive until the SUT
// is done with them.
auto& final_query = queries[queries_issued - 1];
std::vector<QuerySampleLatency> sample_latencies(
GlobalLogger().GetLatenciesBlocking(expected_latencies));
std::vector<QuerySampleLatency> first_token_latencies(
GlobalLogger().GetTokenLatencies(expected_latencies));
std::vector<QuerySampleLatency> time_per_output_token_arr(
GlobalLogger().GetTimePerOutputToken(expected_latencies));
std::vector<int64_t> tokens_per_sample(
GlobalLogger().GetTokensPerSample(expected_latencies));
// Log contention counters after every test as a sanity check.
GlobalLogger().LogContentionAndAllocations();
// This properly accounts for the fact that the max completion time may not
// belong to the final query. It also excludes any time spent postprocessing
// in the loadgen itself after final completion, which may be significant
// in the offline scenario.
PerfClock::time_point max_completion_time =
GlobalLogger().GetMaxCompletionTime();
auto sut_active_duration = max_completion_time - start;
LogDetail([start_for_power, sut_active_duration](AsyncDetail& detail) {
auto end_for_power =
start_for_power +
std::chrono::duration_cast<std::chrono::system_clock::duration>(
sut_active_duration);
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_INTERVAL_START(detail, "power_begin",
DateTimeStringForPower(start_for_power));
MLPERF_LOG_INTERVAL_END(detail, "power_end",
DateTimeStringForPower(end_for_power));
#else
detail("POWER_BEGIN: ", "mode", ToString(mode), "time",
DateTimeStringForPower(start_for_power));
detail("POWER_END: ", "mode", ToString(mode), "time",
DateTimeStringForPower(end_for_power));
#endif
});
double max_latency =
QuerySampleLatencyToSeconds(GlobalLogger().GetMaxLatencySoFar());
double final_query_scheduled_time =
DurationToSeconds(final_query.scheduled_delta);
double final_query_issued_time =
DurationToSeconds(final_query.issued_start_time - start);
double final_query_all_samples_done_time =
DurationToSeconds(final_query.all_samples_done_time - start);
std::vector<QuerySampleLatency> query_latencies;
if (scenario == TestScenario::MultiStream) {
query_latencies.resize(queries_issued);
for (size_t i = 0; i < queries_issued; i++) {
query_latencies[i] = DurationGeneratorNs{queries[i].scheduled_time}.delta(
queries[i].all_samples_done_time);
}
}
return PerformanceResult{
std::move(sample_latencies),
std::move(query_latencies),
queries_issued,
max_latency,
final_query_scheduled_time,
final_query_issued_time,
final_query_all_samples_done_time,
TokenPerformanceResults{first_token_latencies, time_per_output_token_arr,
tokens_per_sample}};
}
void LoadSamplesToRam(QuerySampleLibrary* qsl,
const std::vector<QuerySampleIndex>& samples) {
LogDetail([&samples](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "loaded_qsl_set", samples);
#else
std::string set("\"[");
for (auto i : samples) {
set += std::to_string(i) + ",";
}
set.resize(set.size() - 1);
set += "]\"";
detail("Loading QSL : ", "set", set);
#endif
});
qsl->LoadSamplesToRam(samples);
}
/// \brief Generates random sets of samples in the QSL that we can load into
/// RAM at the same time.
std::vector<LoadableSampleSet> GenerateLoadableSets(
QuerySampleLibrary* qsl, const TestSettingsInternal& settings) {
auto tracer = MakeScopedTracer(
[](AsyncTrace& trace) { trace("GenerateLoadableSets"); });
std::vector<LoadableSampleSet> result;
std::mt19937 qsl_rng(settings.qsl_rng_seed);
// Generate indices for all available samples in the QSL.
const size_t qsl_total_count = qsl->TotalSampleCount();
std::vector<QuerySampleIndex> samples(qsl_total_count);
for (size_t i = 0; i < qsl_total_count; i++) {
samples[i] = static_cast<QuerySampleIndex>(i);
}
// Randomize the order of the samples.
std::shuffle(samples.begin(), samples.end(), qsl_rng);
// Partition the samples into loadable sets.
const size_t set_size = settings.performance_sample_count;
const size_t set_padding = (settings.scenario == TestScenario::MultiStream)
? settings.samples_per_query - 1
: 0;
std::vector<QuerySampleIndex> loadable_set;
loadable_set.reserve(set_size + set_padding);
for (auto s : samples) {
loadable_set.push_back(s);
if (loadable_set.size() == set_size) {
result.push_back({std::move(loadable_set), set_size});
loadable_set.clear();
loadable_set.reserve(set_size + set_padding);
}
}
if (!loadable_set.empty()) {
// Copy the size since it will become invalid after the move.
size_t loadable_set_size = loadable_set.size();
result.push_back({std::move(loadable_set), loadable_set_size});
}
// Add padding for the MultiStream scenario. Padding allows the
// starting sample to be the same for all SUTs, independent of the value
// of samples_per_query, while enabling samples in a query to be contiguous.
for (auto& loadable_set : result) {
auto& set = loadable_set.set;
for (size_t i = 0; i < set_padding; i++) {
// It's not clear in the spec if the STL deallocates the old container
// before assigning, which would invalidate the source before the
// assignment happens. Even though we should have reserved enough
// elements above, copy the source first anyway since we are just moving
// integers around.
QuerySampleIndex p = set[i];
set.push_back(p);
}
}
return result;
}
/// \brief Opens and owns handles to all of the log files.
struct LogOutputs {
LogOutputs(const LogOutputSettings& output_settings,
const std::string& test_date_time) {
std::string prefix = output_settings.outdir;
prefix += "/" + output_settings.prefix;
if (output_settings.prefix_with_datetime) {
prefix += test_date_time + "_";
}
const std::string& suffix = output_settings.suffix;
summary_out.open(prefix + "summary" + suffix + ".txt");
detail_out.open(prefix + "detail" + suffix + ".txt");
accuracy_out.open(prefix + "accuracy" + suffix + ".json");
trace_out.open(prefix + "trace" + suffix + ".json");
}
bool CheckOutputs() {
bool all_ofstreams_good = true;
if (!summary_out.good()) {
all_ofstreams_good = false;
std::cerr << "LoadGen: Failed to open summary file.";
}
if (!detail_out.good()) {
all_ofstreams_good = false;
std::cerr << "LoadGen: Failed to open detailed log file.";
}
if (!accuracy_out.good()) {
all_ofstreams_good = false;
std::cerr << "LoadGen: Failed to open accuracy log file.";
}
if (!trace_out.good()) {
all_ofstreams_good = false;
std::cerr << "LoadGen: Failed to open trace file.";
}
return all_ofstreams_good;
}
std::ofstream summary_out;
std::ofstream detail_out;
std::ofstream accuracy_out;
std::ofstream trace_out;
};
/// \brief Find boundaries of performance settings by widening bounds
/// exponentially.
/// \details To find an upper bound of performance, widen an
/// upper bound exponentially until finding a bound that can't satisfy
/// performance constraints. i.e. [1, 2) -> [2, 4) -> [4, 8) -> ...
template <TestScenario scenario>
std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
PerformanceSummary l_perf_summary) {
// Get upper bound
TestSettingsInternal u_settings = l_perf_summary.settings;
find_peak_performance::WidenPerformanceField<scenario>(&u_settings);
LogDetail(
[l_field = find_peak_performance::ToStringPerformanceField<scenario>(
l_perf_summary.settings),
u_field = find_peak_performance::ToStringPerformanceField<scenario>(
u_settings)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message",
"FindBoundaries: Checking fields [" + l_field + ", " +
u_field + ")");
#else
detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
")");
#endif
});
std::vector<loadgen::LoadableSampleSet> loadable_sets(
loadgen::GenerateLoadableSets(qsl, u_settings));
const LoadableSampleSet& performance_set = loadable_sets.front();
LoadSamplesToRam(qsl, performance_set.set);
PerformanceResult u_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
sut, u_settings, performance_set, sequence_gen));
PerformanceSummary u_perf_summary{sut->Name(), u_settings, std::move(u_pr)};
qsl->UnloadSamplesFromRam(performance_set.set);
std::string tmp;
if (!u_perf_summary.PerfConstraintsMet(&tmp)) {
return std::make_pair(l_perf_summary, u_perf_summary);
} else {
return FindBoundaries<scenario>(sut, qsl, sequence_gen, u_perf_summary);
}
}
/// \brief Find peak performance by binary search.
/// \details The lower & upper bounds found by the function 'FindBoundaries'
/// are used as the initial bounds of the binary search.
template <TestScenario scenario>
PerformanceSummary FindPeakPerformanceBinarySearch(
SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
const LoadableSampleSet& performance_set, PerformanceSummary l_perf_summary,
PerformanceSummary u_perf_summary) {
if (find_peak_performance::IsFinished<scenario>(l_perf_summary.settings,
u_perf_summary.settings)) {
return l_perf_summary;
}
const TestSettingsInternal m_settings =
find_peak_performance::MidOfBoundaries<scenario>(l_perf_summary.settings,
u_perf_summary.settings);
LogDetail([l_field =
find_peak_performance::ToStringPerformanceField<scenario>(
l_perf_summary.settings),
u_field =
find_peak_performance::ToStringPerformanceField<scenario>(
u_perf_summary.settings),
m_field =
find_peak_performance::ToStringPerformanceField<scenario>(
m_settings)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(
detail, "generic_message",
"FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
l_field + ", " + u_field + "): " + m_field);
#else
detail(
"FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
l_field + ", " + u_field + "): " + m_field);
#endif
});
PerformanceResult m_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
sut, m_settings, performance_set, sequence_gen));
PerformanceSummary m_perf_summary{sut->Name(), m_settings, std::move(m_pr)};
std::string tmp;
if (m_perf_summary.PerfConstraintsMet(&tmp)) {
return FindPeakPerformanceBinarySearch<scenario>(
sut, qsl, sequence_gen, performance_set, m_perf_summary,
u_perf_summary);
} else {
return FindPeakPerformanceBinarySearch<scenario>(
sut, qsl, sequence_gen, performance_set, l_perf_summary,
m_perf_summary);
}
}
/// \brief Runs the performance mode, templated by scenario.
template <TestScenario scenario>
void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettingsInternal& settings,
SequenceGen* sequence_gen) {
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message", "Starting performance mode");
#else
detail("Starting performance mode:");
#endif
});
// Use first loadable set as the performance set.
std::vector<loadgen::LoadableSampleSet> loadable_sets(
loadgen::GenerateLoadableSets(qsl, settings));
const LoadableSampleSet& performance_set = loadable_sets.front();
LoadSamplesToRam(qsl, performance_set.set);
// Start PerfClock/system_clock timers for measuring performance interval
// for comparison vs external timer.
auto pc_start_ts = PerfClock::now();
auto sc_start_ts = std::chrono::system_clock::now();
if (settings.print_timestamps) {
std::cout << "Loadgen :: Perf mode start. system_clock Timestamp = "
<< std::chrono::system_clock::to_time_t(sc_start_ts) << "\n"
<< std::flush;
}
PerformanceResult pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
sut, settings, performance_set, sequence_gen));
// Measure PerfClock/system_clock timer durations for comparison vs
// external timer.
auto pc_stop_ts = PerfClock::now();
auto sc_stop_ts = std::chrono::system_clock::now();
auto pc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
pc_stop_ts - pc_start_ts)
.count();
auto sc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
sc_stop_ts - sc_start_ts)
.count();
float pc_sc_ratio = static_cast<float>(pc_duration) / sc_duration;
if (settings.print_timestamps) {
std::cout << "Loadgen :: Perf mode stop. systme_clock Timestamp = "
<< std::chrono::system_clock::to_time_t(sc_stop_ts) << "\n"
<< std::flush;
std::cout << "Loadgen :: PerfClock Perf duration = " << pc_duration
<< "ms\n"
<< std::flush;
std::cout << "Loadgen :: system_clock Perf duration = " << sc_duration
<< "ms\n"
<< std::flush;
std::cout << "Loadgen :: PerfClock/system_clock ratio = " << std::fixed
<< std::setprecision(4) << pc_sc_ratio << "\n"
<< std::flush;
}
if (pc_sc_ratio > 1.01 || pc_sc_ratio < 0.99) {
LogDetail([pc_sc_ratio](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "PerfClock and system_clock differ by more than 1%! "
<< " pc_sc_ratio: " << pc_sc_ratio;
MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
#else
detail.Error("PerfClock and system_clock differ by more than 1\%! ",
"pc_sc_ratio", pc_sc_ratio);
#endif
});
} else if (pc_sc_ratio > 1.001 || pc_sc_ratio < 0.999) {
LogDetail([pc_sc_ratio](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "PerfClock and system_clock differ by more than 0.1%! "
<< " pc_sc_ratio: " << pc_sc_ratio;
MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
#else
detail.Warning("PerfClock and system_clock differ by more than 0.1\%. ",
"pc_sc_ratio", pc_sc_ratio);
#endif
});
}
PerformanceSummary perf_summary{sut->Name(), settings, std::move(pr)};
LogSummary([perf_summary](AsyncSummary& summary) mutable {
perf_summary.LogSummary(summary);
});
// Create a copy to prevent thread hazard between LogSummary and LogDetail.
PerformanceSummary perf_summary_detail{perf_summary};
LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
perf_summary_detail.LogDetail(detail);
});
qsl->UnloadSamplesFromRam(performance_set.set);
}
/// \brief Runs the binary search mode, templated by scenario.
/// \details 1. Check whether the lower bound from the user satisfies the
/// performance constraints, 2. Find an upper bound using the function
/// 'FindBoundaries' based on the lower bound, 3. Find the peak performance
/// settings using the function 'FindPeakPerformanceBinarySearch'. Note:
/// since we can't find a lower bound programmatically because of the
/// monotonicity issue of the Server scenario, we rely on the user's settings.
/// After resolving this issue, we can make the function 'FindBoundaries'
/// find a lower bound as well from some random initial settings.
template <TestScenario scenario>
void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettingsInternal& base_settings,
SequenceGen* sequence_gen) {
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message", "Starting FindPeakPerformance mode");
#else
detail("Starting FindPeakPerformance mode:");
#endif
});
if (scenario != TestScenario::Server) {
LogDetail([unsupported_scenario = ToString(scenario)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR(detail, "error_invalid_config",
find_peak_performance::kNotSupportedMsg);
#else
detail.Error(find_peak_performance::kNotSupportedMsg);
#endif
});
return;
}
LogDetail(
[base_field = find_peak_performance::ToStringPerformanceField<scenario>(
base_settings)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(
detail, "generic_message",
"FindPeakPerformance: Check validity of the base settings field: " +
base_field);
#else
detail(
"FindPeakPerformance: Check validity of the base settings field: " +
base_field);
#endif
});
// 1. Check whether the lower bound provided by the user satisfies the
// performance constraints.
std::vector<loadgen::LoadableSampleSet> base_loadable_sets(
loadgen::GenerateLoadableSets(qsl, base_settings));
const LoadableSampleSet& base_performance_set = base_loadable_sets.front();
LoadSamplesToRam(qsl, base_performance_set.set);
PerformanceResult base_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
sut, base_settings, base_performance_set, sequence_gen));
PerformanceSummary base_perf_summary{sut->Name(), base_settings,
std::move(base_pr)};
// We could also use all_constraints_met to check the performance
// constraints, but to reduce search time we leave it to users to ensure
// the settings satisfy the min duration & min query count.
std::string msg;
if (!base_perf_summary.PerfConstraintsMet(&msg)) {
LogDetail([msg](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "FindPeakPerformance: Initial lower bound does not satisfy "
<< "performance constraints, msg: " << msg;
MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
#else
detail.Error(
"FindPeakPerformance: Initial lower bound does not satisfy "
"performance constraints, msg: " +
msg);
#endif
});
PerformanceSummary perf_summary{sut->Name(), base_settings,
std::move(base_perf_summary.pr)};
LogSummary([perf_summary](AsyncSummary& summary) mutable {
perf_summary.LogSummary(summary);
});
// Create a copy to prevent thread hazard between LogSummary and LogDetail.
PerformanceSummary perf_summary_detail{perf_summary};
LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
perf_summary_detail.LogDetail(detail);
});
qsl->UnloadSamplesFromRam(base_performance_set.set);
return;
}
// Clear loaded samples.
qsl->UnloadSamplesFromRam(base_performance_set.set);
// 2. Find an upper bound based on the lower bound.
std::pair<PerformanceSummary, PerformanceSummary> boundaries =
FindBoundaries<scenario>(sut, qsl, sequence_gen, base_perf_summary);
PerformanceSummary l_perf_summary = boundaries.first;
PerformanceSummary u_perf_summary = boundaries.second;
LogDetail(
[l_field = find_peak_performance::ToStringPerformanceField<scenario>(
l_perf_summary.settings),
u_field = find_peak_performance::ToStringPerformanceField<scenario>(
u_perf_summary.settings)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message",
"FindPeakPerformance: Found boundaries: [" + l_field + ", " +
u_field + ")");
#else
detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
u_field + ")");
#endif
});
// Reuse performance_set; u_perf_summary has the largest 'samples_per_query'.
std::vector<loadgen::LoadableSampleSet> loadable_sets(
loadgen::GenerateLoadableSets(qsl, u_perf_summary.settings));
const LoadableSampleSet& performance_set = loadable_sets.front();
LoadSamplesToRam(qsl, performance_set.set);
// 3. Find peak performance settings using the found boundaries
PerformanceSummary perf_summary = FindPeakPerformanceBinarySearch<scenario>(
sut, qsl, sequence_gen, performance_set, l_perf_summary, u_perf_summary);
// Print out the peak performance test setting.
LogDetail([field = find_peak_performance::ToStringPerformanceField<scenario>(
perf_summary.settings)](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message",
"FindPeakPerformance: Found peak performance field: " + field);
#else
detail("FindPeakPerformance: Found peak performance field: " + field);
#endif
});
LogSummary([perf_summary](AsyncSummary& summary) mutable {
perf_summary.LogSummary(summary);
});
// Create a copy to prevent thread hazard between LogSummary and LogDetail.
PerformanceSummary perf_summary_detail{perf_summary};
LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
perf_summary_detail.LogDetail(detail);
});
qsl->UnloadSamplesFromRam(performance_set.set);
}
/// \brief Runs the accuracy mode, templated by scenario.
template <TestScenario scenario>
void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettingsInternal& settings,
SequenceGen* sequence_gen) {
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "generic_message", "Starting accuracy mode");
#else
detail("Starting accuracy mode:");
#endif
});
std::vector<loadgen::LoadableSampleSet> loadable_sets(
loadgen::GenerateLoadableSets(qsl, settings));
for (auto& loadable_set : loadable_sets) {
{
auto tracer = MakeScopedTracer(
[count = loadable_set.set.size()](AsyncTrace& trace) {
trace("LoadSamples", "count", count);
});
LoadSamplesToRam(qsl, loadable_set.set);
}
PerformanceResult pr(IssueQueries<scenario, TestMode::AccuracyOnly>(
sut, settings, loadable_set, sequence_gen));
{
auto tracer = MakeScopedTracer(
[count = loadable_set.set.size()](AsyncTrace& trace) {
trace("UnloadSampes", "count", count);
});
qsl->UnloadSamplesFromRam(loadable_set.set);
}
}
}
/// \brief Routes runtime scenario requests to the corresponding instances
/// of its templated mode functions.
struct RunFunctions {
using Signature = void(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettingsInternal& settings,
SequenceGen* sequence_gen);
template <TestScenario compile_time_scenario>
static RunFunctions GetCompileTime() {
return {(RunAccuracyMode<compile_time_scenario>),
(RunPerformanceMode<compile_time_scenario>),
(FindPeakPerformanceMode<compile_time_scenario>)};
}
static RunFunctions Get(TestScenario run_time_scenario) {
switch (run_time_scenario) {
case TestScenario::SingleStream:
return GetCompileTime<TestScenario::SingleStream>();
case TestScenario::MultiStream:
return GetCompileTime<TestScenario::MultiStream>();
case TestScenario::Server:
return GetCompileTime<TestScenario::Server>();
case TestScenario::Offline:
return GetCompileTime<TestScenario::Offline>();
}
// We should not reach this point.
assert(false);
return GetCompileTime<TestScenario::SingleStream>();
}
Signature& accuracy;
Signature& performance;
Signature& find_peak_performance;
};
} // namespace loadgen
void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettings& requested_settings,
const LogSettings& log_settings,
const std::string audit_config_filename) {
GlobalLogger().StartIOThread();
const std::string test_date_time = CurrentDateTimeISO8601();
loadgen::LogOutputs log_outputs(log_settings.log_output, test_date_time);
if (!log_outputs.CheckOutputs()) {
return;
}
GlobalLogger().StartLogging(&log_outputs.summary_out, &log_outputs.detail_out,
&log_outputs.accuracy_out,
log_settings.log_output.copy_detail_to_stdout,
log_settings.log_output.copy_summary_to_stdout);
GlobalLogger().SetUseTokens(requested_settings.use_token_latencies);
bool needs_first_token =
(requested_settings.scenario != TestScenario::Offline);
GlobalLogger().SetNeedsFirstToken(needs_first_token);
if (log_settings.enable_trace) {
GlobalLogger().StartNewTrace(&log_outputs.trace_out, PerfClock::now());
}
// measure sut->Name() response time
PerfClock::time_point pre_get_sut_name_ts = PerfClock::now();
const std::string& sut_name = sut->Name();
PerfClock::time_point post_get_sut_name_ts = PerfClock::now();
auto get_sut_name_duration_ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(
post_get_sut_name_ts - pre_get_sut_name_ts)
.count();
LogLoadgenVersion();
LogDetail([sut, qsl, test_date_time, &sut_name,
&get_sut_name_duration_ns](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "test_datetime", test_date_time);
MLPERF_LOG(detail, "sut_name", sut_name);
MLPERF_LOG(detail, "get_sut_name_duration_ns", get_sut_name_duration_ns);
MLPERF_LOG(detail, "qsl_name", qsl->Name());
MLPERF_LOG(detail, "qsl_reported_total_count", qsl->TotalSampleCount());
MLPERF_LOG(detail, "qsl_reported_performance_count",
qsl->PerformanceSampleCount());
#else
detail("Date + time of test: ", test_date_time);
detail("System Under Test (SUT) name: ", sut_name);
detail("Get SUT name time [ns]: ", get_sut_name_duration_ns);
detail("Query Sample Library (QSL) name: ", qsl->Name());
detail("QSL total size: ", qsl->TotalSampleCount());
detail("QSL performance size*: ", qsl->PerformanceSampleCount());
detail("*TestSettings (performance_sample_count_override) can override");
detail("*Refer to Effective Settings for actual value");
#endif
});
TestSettings test_settings = requested_settings;
// Look for Audit Config file to override TestSettings during audit
if (FileExists(audit_config_filename)) {
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING(detail, "warning_generic_message",
"Found Audit Config file (audit.config)."
" Overriding TestSettings from audit.config file.");
#else
detail(
"Found Audit Config file (audit.config)."
" Overriding TestSettings from audit.config file.");
#endif
});
std::string audit_scenario = loadgen::ToString(test_settings.scenario);
// Remove Spaces from the string
RemoveValue(&audit_scenario, ' ');
const std::string generic_model = "*";
test_settings.FromConfig(audit_config_filename, generic_model,
audit_scenario, 2);
}
if (test_settings.test05) {
// If the configuration indicates we are running test05, override the
// random seeds.
LogDetail([](AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING(detail, "warning_generic_message",
"Test05 flag detected"
" Overriding random seeds");
#else
detail(
"Test05 flag detected"
" Overriding random seeds");
#endif
});
test_settings.mode = TestMode::PerformanceOnly;
test_settings.qsl_rng_seed = requested_settings.test05_qsl_rng_seed;
test_settings.sample_index_rng_seed =
requested_settings.test05_sample_index_rng_seed;
test_settings.schedule_rng_seed =
requested_settings.test05_schedule_rng_seed;
}
loadgen::TestSettingsInternal sanitized_settings(
test_settings, qsl->PerformanceSampleCount());
sanitized_settings.LogAllSettings();
auto run_funcs = loadgen::RunFunctions::Get(sanitized_settings.scenario);
loadgen::SequenceGen sequence_gen;
switch (sanitized_settings.mode) {
case TestMode::SubmissionRun:
run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
break;
case TestMode::AccuracyOnly:
run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
break;
case TestMode::PerformanceOnly:
run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
break;
case TestMode::FindPeakPerformance:
run_funcs.find_peak_performance(sut, qsl, sanitized_settings,
&sequence_gen);
break;
}
loadgen::IssueQueryController::GetInstance().EndThreads();
// Stop tracing after logging so all logs are captured in the trace.
GlobalLogger().StopLogging();
GlobalLogger().StopTracing();
GlobalLogger().StopIOThread();
}
void AbortTest() {
loadgen::IssueQueryController::GetInstance().EndThreads();
GlobalLogger().StopLogging();
GlobalLogger().StopTracing();
GlobalLogger().StopIOThread();
}
void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
const ResponseCallback& response_cb) {
PerfClock::time_point timestamp = PerfClock::now();
auto tracer = MakeScopedTracer(
[](AsyncTrace& trace) { trace("QuerySamplesComplete"); });
const QuerySampleResponse* end = responses + response_count;
// Notify first to unblock loadgen production ASAP.
for (QuerySampleResponse* response = responses; response < end; response++) {
loadgen::SampleMetadata* sample =
reinterpret_cast<loadgen::SampleMetadata*>(response->id);
loadgen::QueryMetadata* query = sample->query_metadata;
query->NotifyOneSampleCompleted(timestamp);
}
// Log samples.
for (QuerySampleResponse* response = responses; response < end; response++) {
loadgen::SampleMetadata* sample =
reinterpret_cast<loadgen::SampleMetadata*>(response->id);
loadgen::QueryMetadata* query = sample->query_metadata;
query->response_delegate->SampleComplete(sample, response, timestamp,
response_cb);
}
// PerfClock::time_point end_timestamp = PerfClock::now();
// mlperf::samples_overhead_acum += (end_timestamp - timestamp).count();
}
void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
const ResponseCallback& response_cb) {
PerfClock::time_point timestamp = PerfClock::now();
auto tracer =
MakeScopedTracer([](AsyncTrace& trace) { trace("FirstTokenComplete"); });
const QuerySampleResponse* end = responses + response_count;
// Log samples.
for (QuerySampleResponse* response = responses; response < end; response++) {
loadgen::SampleMetadata* sample =
reinterpret_cast<loadgen::SampleMetadata*>(response->id);
loadgen::QueryMetadata* query = sample->query_metadata;
query->response_delegate->TokenComplete(sample, response, timestamp,
response_cb);
}
// PerfClock::time_point end_timestamp = PerfClock::now();
// mlperf::tokens_overhead_acum += (end_timestamp - timestamp).count();
}
} // namespace mlperf
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Provides the entry points for a SUT to start a test and respond
/// to issued queries.
#ifndef MLPERF_LOADGEN_LOADGEN_H_
#define MLPERF_LOADGEN_LOADGEN_H_
#include <cstddef>
#include <functional>
#include <numeric>
#include <string>
/// \brief Contains the loadgen API.
namespace mlperf {
struct QuerySampleResponse;
class QuerySampleLibrary;
class SystemUnderTest;
struct TestSettings;
struct LogSettings;
using ResponseCallback = std::function<void(QuerySampleResponse*)>;
/// \addtogroup LoadgenAPI Loadgen API
/// @{
///
/// \brief SUT calls this to notify loadgen of completed samples.
/// \details
/// * The samples may be from any combination of queries or partial queries as
/// issued by \link mlperf::SystemUnderTest::IssueQuery
/// SystemUnderTest::IssueQuery \endlink.
/// * The SUT is responsible for owning and allocating the response data. The
/// loadgen will copy the response data if needed (e.g. for accuracy mode).
/// + If no response callback is provided, the response data must remain valid
/// for the entire duration of this call.
/// + The response callback is untimed; it is called for each response in
/// responses after the loadgen records the completion time and before the
/// loadgen copies the response data. The response callback enables the
/// loadgen to simulate response data being stored in accelerator DRAM.
/// After the response callback is called, response data must reside on the
/// host so that the loadgen can copy it. Submitters must seek prior
/// approval to use this feature of loadgen (refer to
/// https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#5-load-generator).
/// * All calls to QuerySamplesComplete are thread-safe and wait-free bounded.
/// + Any number of threads can call QuerySamplesComplete simultaneously.
/// + Regardless of where any other thread stalls, the current thread will
/// finish QuerySamplesComplete in a bounded number of cycles.
/// + Note: If a callback is provided, the SUT must ensure that the callback
/// is also thread-safe and wait-free bounded for the above to hold.
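///
/// A minimal sketch of the completion path from inside an SUT's IssueQuery
/// (MySut and RunInference are hypothetical names, not part of loadgen; the
/// output buffers are assumed to outlive the QuerySamplesComplete call):
/// \code{.cpp}
/// void MySut::IssueQuery(const std::vector<QuerySample>& samples) {
///   std::vector<QuerySampleResponse> responses;
///   responses.reserve(samples.size());
///   for (const QuerySample& s : samples) {
///     const std::vector<uint8_t>& out = RunInference(s.index);
///     responses.push_back(
///         {s.id, reinterpret_cast<uintptr_t>(out.data()), out.size()});
///   }
///   QuerySamplesComplete(responses.data(), responses.size());
/// }
/// \endcode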
void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
const ResponseCallback& response_cb = {});
void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
const ResponseCallback& response_cb = {});
///
/// \brief Starts the test against SUT with the specified settings.
/// \details This is the C++ entry point. See mlperf::c::StartTest for the
/// C entry point.
///
void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
const TestSettings& requested_settings,
const LogSettings& log_settings,
const std::string audit_config_filename = "audit.config");
///
/// \brief Aborts the running test.
/// \details This function will stop issuing new samples to the SUT. StartTest
/// will return after the current inference finishes. Since StartTest is a
/// blocking function, AbortTest can only be called from another thread.
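///
/// A minimal sketch of aborting a test (assuming sut, qsl, settings, and
/// log_settings were already constructed; StartTest blocks, so it runs on
/// its own thread):
/// \code{.cpp}
/// std::thread test_thread(
///     [&] { StartTest(sut, qsl, settings, log_settings); });
/// // ... decide to stop early ...
/// AbortTest();         // issued from this thread
/// test_thread.join();  // StartTest returns once the current inference ends
/// \endcode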
void AbortTest();
///
/// \brief Register a thread for query issuing in Server scenario.
/// \details If a thread registers itself, that thread is used to call the
/// SUT's IssueQuery(). This function blocks until the entire test is done. The
/// number of registered threads must match server_num_issue_query_threads in
/// TestSettings. This function only has an effect in the Server scenario.
/// This is the C++ entry point. See mlperf::c::RegisterIssueQueryThread for the
/// C entry point.
///
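/// A sketch of registering issue threads in the Server scenario (assuming
/// settings.server_num_issue_query_threads was set to kIssueThreads):
/// \code{.cpp}
/// constexpr size_t kIssueThreads = 2;  // must match TestSettings
/// std::vector<std::thread> issuers;
/// for (size_t i = 0; i < kIssueThreads; i++) {
///   issuers.emplace_back([] { RegisterIssueQueryThread(); });
/// }
/// StartTest(sut, qsl, settings, log_settings);  // issues via those threads
/// for (auto& t : issuers) t.join();  // they return when the test ends
/// \endcode
///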
void RegisterIssueQueryThread();
// inline long long samples_overhead_acum;
// inline long long tokens_overhead_acum;
/// @}
} // namespace mlperf
#endif // MLPERF_LOADGEN_LOADGEN_H_
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created by diasvg.py -->
<svg width="29.100cm" height="12.100cm" viewBox="1.950 0.950 29.100 12.100"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink">
<!-- Layer: Background -->
<g id="Background">
<!-- Flowchart - Box -->
<rect x="2.000" y="2.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="6.000" y="3.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
Model + Dataset</text>
<!-- Flowchart - Box -->
<rect x="2.000" y="6.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="6.000" y="7.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
Pre Processor</text>
<!-- Flowchart - Box -->
<rect x="24.000" y="6.000" width="7.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="27.500" y="7.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
Post Processor</text>
<!-- Flowchart - Box -->
<rect x="13.000" y="1.000" width="8.000" height="8.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="17.000" y="5.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
Benchmark</text>
<!-- Flowchart - Box -->
<rect x="24.000" y="1.000" width="7.000" height="4.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="27.500" y="3.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
Backend</text>
<!-- Flowchart - Box -->
<rect x="13.000" y="11.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100" rx="0.000" />
<text x="17.000" y="12.195" fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
LoadGen</text>
<!-- Standard - Line -->
<line x1="10.000" y1="3.000" x2="12.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="12.888,3.000 12.388,3.250 12.513,3.000 12.388,2.750 "/>
<!-- Standard - Line -->
<line x1="10.000" y1="7.000" x2="12.513" y2="7.000" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="12.888,7.000 12.388,7.250 12.513,7.000 12.388,6.750 "/>
<!-- Standard - Line -->
<line x1="15.000" y1="9.000" x2="15.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="15.000,10.888 14.750,10.388 15.000,10.513 15.250,10.388 "/>
<!-- Standard - Line -->
<line x1="17.000" y1="11.000" x2="17.000" y2="9.487" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="17.000,9.112 17.250,9.612 17.000,9.487 16.750,9.612 "/>
<!-- Standard - Line -->
<line x1="19.000" y1="9.000" x2="19.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="19.000,10.888 18.750,10.388 19.000,10.513 19.250,10.388 "/>
<!-- Standard - Line -->
<line x1="21.000" y1="3.000" x2="23.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="23.888,3.000 23.388,3.250 23.513,3.000 23.388,2.750 "/>
<!-- Standard - Line -->
<line x1="24.000" y1="7.000" x2="21.487" y2="7.000" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="21.112,7.000 21.612,6.750 21.487,7.000 21.612,7.250 "/>
<!-- Standard - Text -->
<text x="10.850" y="2.650" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
1</text>
<!-- Standard - Text -->
<!-- Standard - Text -->
<text x="14.100" y="9.950" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
2</text>
<!-- Standard - Text -->
<text x="16.100" y="10.450" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
3</text>
<!-- Standard - Text -->
<text x="18.100" y="9.950" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
5</text>
<!-- Standard - Text -->
<text x="21.950" y="2.650" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
4</text>
<!-- Standard - Text -->
<!-- Standard - Text -->
<text x="24.500" y="12.221" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
LoadGen Logs</text>
<!-- Standard - Line -->
<line x1="21.000" y1="12.000" x2="23.513" y2="12.000" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="23.888,12.000 23.388,12.250 23.513,12.000 23.388,11.750 "/>
<!-- Standard - Text -->
<text x="21.900" y="11.450" fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
6</text>
<!-- Standard - Text -->
<!-- Standard - Text -->
<!-- Standard - Line -->
<line x1="27.500" y1="5.000" x2="27.500" y2="5.513" stroke="#000000" stroke-width="0.100" />
<polygon fill="#000000" stroke="#000000" stroke-width="0.100" points="27.500,5.888 27.250,5.388 27.500,5.513 27.750,5.388 "/>
</g>
</svg>
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Implements a logging system with a central IO thread that handles
/// all stringification and IO.
/// \details Log-producing threads only submit lambdas to be executed on the
/// IO thread.
/// All producers and consumers use lock-free operations that guarantee
/// forward progress independent of a) other stalled threads and b) where
/// those threads are stalled.
/// Each thread uses a double-buffering scheme to queue its logs. One buffer
/// is always reserved for writes and the other is reserved for reads.
/// A producing thread sends requests to the IOThread to swap the buffers
/// and the IOThread does the actual read/write swap after it has finished
/// reading the buffer it was working on.
#include "logging.h"
#include <cassert>
#include <cmath>
#include <future>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <process.h>
#include <windows.h>
#define MLPERF_GET_PID() _getpid()
#else
#include <unistd.h>
#define MLPERF_GET_PID() getpid()
#endif
// Use system-level TID for tracing. This enables correlation with other
// performance tools that are not aware of C++ std::thread::id.
#if defined(__linux__)
#include <sys/syscall.h>
#define MLPERF_GET_TID() syscall(SYS_gettid)
#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
#define MLPERF_GET_TID() GetCurrentThreadId()
#elif defined(__APPLE__)
#define MLPERF_GET_TID() \
std::hash<std::thread::id>{}(std::this_thread::get_id())
#else
// TODO: std::this_thread::get_id() returns std::thread::id, a class type,
// but MLPERF_GET_TID() results are assigned to uint64_t.
#define MLPERF_GET_TID() std::this_thread::get_id()
#endif
#include "utils.h"
namespace mlperf {
namespace logging {
namespace {
uintptr_t SwapRequestSlotIsWritableValue(size_t id) {
// LSB of 1 indicates that this isn't a pointer.
// MSBs encode the id to detect collisions when a slot in
// |thread_swap_request_slots_| is reused for a different id and the request
// for the previous id is very slow.
return (id << 1) | 0x1;
}
bool SwapRequestSlotIsReadable(uintptr_t value) {
// Valid pointers will not have their lsb set.
return (value & 0x1) != 0x1;
}
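// For example, SwapRequestSlotIsWritableValue(5) yields (5 << 1) | 1 = 0xB:
// the set LSB marks the slot writable, while a stored TlsLogger* is at least
// pointer-aligned, so its LSB is always clear and the slot reads as readable.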
constexpr size_t kMaxThreadsToLog = 1024;
constexpr std::chrono::milliseconds kLogPollPeriod(10);
/// \brief How many log entries to pre-allocate per thread to help avoid
/// runtime allocation.
constexpr size_t kTlsLogReservedEntryCount = 1024;
constexpr auto kInvalidLatency = std::numeric_limits<QuerySampleLatency>::min();
constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
} // namespace
const std::string& ArgValueTransform(const bool& value) {
static const std::string v_true("true");
static const std::string v_false("false");
return value ? v_true : v_false;
}
char Bin2Hex(uint8_t four_bits) {
char number = '0' + four_bits;
char letter = ('A' - 10) + four_bits;
return four_bits < 10 ? number : letter;
}
const std::string ArgValueTransform(const LogBinaryAsHexString& value) {
if (value.data == nullptr) {
return "\"\"";
}
std::string hex;
hex.reserve(2 * value.data->size() + 2);  // Two hex chars per byte + quotes.
hex.push_back('"');
for (auto b : *value.data) {
hex.push_back(Bin2Hex(b >> 4));
hex.push_back(Bin2Hex(b & 0x0F));
}
hex.push_back('"');
return hex;
}
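// For example, a payload of bytes {0xDE, 0xAD} is rendered by the transform
// above as the quoted string "DEAD".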
#if USE_NEW_LOGGING_FORMAT
const std::string ArgValueTransform(const std::string& value) {
return std::string("\"") + value + std::string("\"");
}
const std::string ArgValueTransform(const char* value) {
return std::string("\"") + std::string(value) + std::string("\"");
}
const std::string ArgValueTransform(const std::vector<size_t>& value) {
std::string s("[");
for (auto i : value) {
s += std::to_string(i) + ",";
}
if (!value.empty()) {
s.resize(s.size() - 1);  // Drop the trailing comma.
}
s += "]";
return s;
}
const std::string ArgValueTransform(
const std::map<std::string, std::string>& value) {
std::string s("{");
for (const auto& i : value) {
s += "\"";
s += i.first;
s += "\":\"";
s += i.second;
s += "\",";
}
if (!value.empty()) {
s.resize(s.size() - 1);  // Drop the trailing comma.
}
s += "}";
return s;
}
const std::string ArgValueTransform(const float value) {
if (value == std::numeric_limits<float>::infinity()) {
return "Infinity";
} else if (value == -std::numeric_limits<float>::infinity()) {
return "-Infinity";
} else if (std::isnan(value)) {
return "NaN";
}
return std::to_string(value);
}
const std::string ArgValueTransform(const double value) {
if (value == std::numeric_limits<double>::infinity()) {
return "Infinity";
} else if (value == -std::numeric_limits<double>::infinity()) {
return "-Infinity";
} else if (std::isnan(value)) {
return "NaN";
}
return std::to_string(value);
}
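// Note: these explicit spellings exist because std::to_string renders IEEE
// special values in an implementation-defined way (e.g. "inf"), which the
// downstream log parsers presumably cannot key on; "Infinity", "-Infinity",
// and "NaN" keep the emitted values unambiguous.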
#endif
ChromeTracer::ChromeTracer(std::ostream* out, PerfClock::time_point origin)
: out_(out), origin_(origin) {
WriteTraceEventHeader();
}
ChromeTracer::~ChromeTracer() {
WriteTraceEventFooter();
out_->flush();
}
void ChromeTracer::WriteTraceEventHeader() {
// Times and durations are converted from nanoseconds to microseconds; use
// 3 decimal digits to preserve precision.
*out_ << std::fixed << std::setprecision(3) << "{\"traceEvents\":[\n";
}
void ChromeTracer::WriteTraceEventFooter() {
*out_ << "{\"name\":\"LastTrace\"}\n"
<< "],\n"
<< "\"displayTimeUnit\":\"ns\",\n"
<< "\"otherData\":{\n"
<< "\"ts\":" << Micros(origin_.time_since_epoch()).count() << ",\n"
<< "\"version\":\"MLPerf LoadGen v1.0\"\n"
<< "}\n"
<< "}\n";
}
void AsyncLog::SetCurrentPidTid(uint64_t pid, uint64_t tid) {
current_pid_ = pid;
current_tid_ = tid;
}
void AsyncLog::SetLogFiles(std::ostream* summary, std::ostream* detail,
std::ostream* accuracy, bool copy_detail_to_stdout,
bool copy_summary_to_stdout,
PerfClock::time_point log_origin) {
std::unique_lock<std::mutex> lock(log_mutex_);
if (summary_out_ != &std::cerr) {
std::string warning_summary;
if (log_warning_count_ == 0) {
warning_summary = "\nNo warnings encountered during test.\n";
} else if (log_warning_count_ == 1) {
warning_summary = "\n1 warning encountered. See detailed log.\n";
} else {
warning_summary = "\n" + std::to_string(log_warning_count_) +
" warnings encountered. See detailed log.\n";
}
std::string error_summary;
if (log_error_count_ == 0) {
error_summary = "\nNo errors encountered during test.\n";
} else if (log_error_count_ == 1) {
error_summary = "\n1 ERROR encountered. See detailed log.\n";
} else {
error_summary = "\n" + std::to_string(log_error_count_) +
" ERRORS encountered. See detailed log.\n";
}
*summary_out_ << warning_summary << error_summary;
if (copy_summary_to_stdout_) {
std::cout << warning_summary << error_summary;
}
}
if (summary_out_) {
summary_out_->flush();
}
if (detail_out_) {
detail_out_->flush();
}
if (accuracy_out_ != &std::cerr) {
WriteAccuracyFooterLocked();
accuracy_out_->flush();
}
summary_out_ = summary;
detail_out_ = detail;
accuracy_out_ = accuracy;
if (accuracy_out_ != &std::cerr) {
WriteAccuracyHeaderLocked();
}
copy_detail_to_stdout_ = copy_detail_to_stdout;
copy_summary_to_stdout_ = copy_summary_to_stdout;
log_origin_ = log_origin;
log_error_count_ = 0;
log_warning_count_ = 0;
}
void AsyncLog::StartNewTrace(std::ostream* trace_out,
PerfClock::time_point origin) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (trace_out) {
tracer_ = std::make_unique<ChromeTracer>(trace_out, origin);
} else {
tracer_.reset();
}
}
void AsyncLog::StopTrace() {
std::unique_lock<std::mutex> lock(trace_mutex_);
tracer_.reset();
}
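// LogAccuracy appends one JSON object per completed sample to the accuracy
// log. A record looks roughly like this (the token fields appear only when
// use_tokens_ is set):
//   { "seq_id" : 7, "qsl_idx" : 42, "data" : "0A1B", "token_count" : 4 }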
void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
const LogBinaryAsHexString& response,
int64_t n_tokens = 0) {
std::unique_lock<std::mutex> lock(log_mutex_);
if (!accuracy_out_) {
return;
}
*accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ ");
if (!use_tokens_) {
LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
response);
} else if (!needs_first_token_) {
LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
response, "token_count", n_tokens);
} else {
const size_t i = seq_id - latencies_first_sample_sequence_id_;
LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
response, "token_data", token_records_[i], "token_count", n_tokens);
}
*accuracy_out_ << " }";
accuracy_needs_comma_ = true;
}
void AsyncLog::CacheToken(uint64_t seq_id,
const LogBinaryAsHexString& response) {
std::unique_lock<std::mutex> lock(token_record_mutex_);
const size_t i = seq_id - latencies_first_sample_sequence_id_;
if (token_records_.size() <= i) {
token_records_.resize(i + 1);
}
token_records_[i] = response;
}
void AsyncLog::Flush() {
{
std::unique_lock<std::mutex> lock(log_mutex_);
if (summary_out_) {
summary_out_->flush();
}
if (detail_out_) {
detail_out_->flush();
}
if (accuracy_out_) {
accuracy_out_->flush();
}
}
{
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->Flush();
}
}
}
void AsyncLog::WriteAccuracyHeaderLocked() {
*accuracy_out_ << "[";
accuracy_needs_comma_ = false;
}
void AsyncLog::WriteAccuracyFooterLocked() { *accuracy_out_ << "\n]\n"; }
void AsyncLog::RestartLatencyRecording(uint64_t first_sample_sequence_id,
size_t latencies_to_reserve) {
std::unique_lock<std::mutex> lock(latencies_mutex_);
assert(latencies_.empty());
assert(latencies_recorded_ == latencies_expected_);
latencies_recorded_ = 0;
latencies_expected_ = 0;
max_latency_ = 0;
max_completion_timstamp_ = PerfClock::now();
latencies_first_sample_sequence_id_ = first_sample_sequence_id;
latencies_.reserve(latencies_to_reserve);
token_latencies_.reserve(latencies_to_reserve);
tokens_per_sample_.reserve(latencies_to_reserve);
time_per_output_token_.reserve(latencies_to_reserve);
}
void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id,
PerfClock::time_point completion_time,
QuerySampleLatency latency,
int64_t n_tokens = 0) {
std::unique_lock<std::mutex> lock(latencies_mutex_);
max_latency_ = std::max(max_latency_, latency);
max_completion_timstamp_ =
std::max(max_completion_timstamp_, completion_time);
if (sample_sequence_id < latencies_first_sample_sequence_id_) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "Received completion for an old sample."
<< " Min expected id: " << latencies_first_sample_sequence_id_
<< " Actual id: " << sample_sequence_id;
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
#else
GlobalLogger().LogErrorSync(
"Received completion for an old sample.", "Min expected id",
latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
#endif
return;
}
const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
if (latencies_.size() <= i) {
// TODO: Reserve in advance.
latencies_.resize(i + 1, kInvalidLatency);
} else if (latencies_[i] != kInvalidLatency) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to complete a sample twice.");
#else
GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
#endif
// Return without recording the latency again to avoid potentially
// ending the test before the SUT is actually done, which could result
// in a segfault.
// If the SUT recorded the wrong sample, the test will hang and see
// the error above.
return;
}
if (use_tokens_) {
if (needs_first_token_ && ((token_latencies_.size() <= i) ||
(token_latencies_[i] == kInvalidLatency))) {
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to record a sample latency before its "
"first token latency");
}
if (tokens_per_sample_.size() <= i) {
// TODO: Reserve in advance.
tokens_per_sample_.resize(i + 1, nTokenInvalid);
} else if (tokens_per_sample_[i] != nTokenInvalid) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to complete a sample twice.");
#else
GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
#endif
// Return without recording the latency again to avoid potentially
// ending the test before the SUT is actually done, which could result
// in a segfault.
// If the SUT recorded the wrong sample, the test will hang and see
// the error above.
return;
}
if (n_tokens == 0) {
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"n_tokens argument missing or attempted to record "
"0 as number of tokens");
} else if (n_tokens < 0) {
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to record a negative number of tokens");
n_tokens = 0;
} else if (n_tokens == 1) {
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Number of tokens needs to be greater than 1");
n_tokens = 0;
}
if (time_per_output_token_.size() <= i) {
time_per_output_token_.resize(i + 1, kInvalidLatency);
} else if (time_per_output_token_[i] != kInvalidLatency) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to complete a sample twice.");
#else
GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
#endif
// Return without recording the latency again to avoid potentially
// ending the test before the SUT is actually done, which could result
// in a segfault.
// If the SUT recorded the wrong sample, the test will hang and see
// the error above.
return;
}
tokens_per_sample_[i] = n_tokens;
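// Time per output token (TPOT) excludes the first token: with first-token
// latency t1, total latency tN, and n tokens, the remaining n - 1 tokens
// took (tN - t1); e.g. t1 = 200ms, tN = 1100ms, n = 10 gives
// (1100 - 200) / 9 = 100ms per output token.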
time_per_output_token_[i] =
(latency - token_latencies_[i]) / (n_tokens - 1);
}
latencies_[i] = latency;
latencies_recorded_++;
if (AllLatenciesRecorded()) {
all_latencies_recorded_.notify_all();
}
}
void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id,
PerfClock::time_point completion_time,
QuerySampleLatency latency) {
std::unique_lock<std::mutex> lock(token_latencies_mutex_);
// std::unique_lock<std::mutex> lock(latencies_mutex_);
// max_latency_ = std::max(max_latency_, latency);
// max_completion_timstamp_ =
// std::max(max_completion_timstamp_, completion_time);
if (sample_sequence_id < latencies_first_sample_sequence_id_) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "Received completion for an old sample."
<< " Min expected id: " << latencies_first_sample_sequence_id_
<< " Actual id: " << sample_sequence_id;
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
#else
GlobalLogger().LogErrorSync(
"Received completion for an old sample.", "Min expected id",
latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
#endif
return;
}
const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
if (latencies_.size() > i) {
if (latencies_[i] != kInvalidLatency) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC(
GlobalLogger(), "error_runtime",
"Attempted to record token latency after sample was completed");
#else
GlobalLogger().LogErrorSync(
"Attempted to record token latency after sample was completed");
#endif
// Return without recording the latency again to avoid potentially
// ending the test before the SUT is actually done, which could result
// in a segfault.
// If the SUT recorded the wrong sample, the test will hang and see
// the error above.
return;
}
}
if (token_latencies_.size() <= i) {
// TODO: Reserve in advance.
token_latencies_.resize(i + 1, kInvalidLatency);
} else if (token_latencies_[i] != kInvalidLatency) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
"Attempted to complete a sample twice.");
#else
GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
#endif
// Return without recording the latency again to avoid potentially
// ending the test before the SUT is actually done, which could result
// in a segfault.
// If the SUT recorded the wrong sample, the test will hang and see
// the error above.
return;
}
token_latencies_[i] = latency;
}
std::vector<QuerySampleLatency> AsyncLog::GetLatenciesBlocking(
size_t expected_count) {
std::vector<QuerySampleLatency> latencies;
{
std::unique_lock<std::mutex> lock(latencies_mutex_);
latencies_expected_ = expected_count;
all_latencies_recorded_.wait(lock, [&] { return AllLatenciesRecorded(); });
latencies.swap(latencies_);
}
if (latencies.size() != expected_count) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "Received SequenceId that was too large."
<< " expected_size: " << expected_count
<< " actual_size: " << latencies.size();
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
#else
GlobalLogger().LogErrorSync("Received SequenceId that was too large.",
"expected_size", expected_count, "actual_size",
latencies.size());
#endif
}
size_t invalid_latency_count = 0;
for (auto l : latencies) {
if (l == kInvalidLatency) {
invalid_latency_count++;
}
}
if (invalid_latency_count != 0) {
// Call LogErrorSync here since this kind of error could result in a
// segfault in the near future.
#if USE_NEW_LOGGING_FORMAT
std::stringstream ss;
ss << "Encountered incomplete samples at the end of a series of queries."
<< " count: " << invalid_latency_count;
MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
#else
GlobalLogger().LogErrorSync(
"Encountered incomplete samples at the end of a series of queries.",
"count", invalid_latency_count);
#endif
}
return latencies;
}
std::vector<QuerySampleLatency> AsyncLog::GetTokenLatencies(
size_t expected_count) {
std::vector<QuerySampleLatency> token_latencies;
token_latencies.swap(token_latencies_);
return token_latencies;
}
std::vector<QuerySampleLatency> AsyncLog::GetTimePerOutputToken(
size_t expected_count) {
std::vector<QuerySampleLatency> tpot_latencies;
tpot_latencies.swap(time_per_output_token_);
return tpot_latencies;
}
std::vector<int64_t> AsyncLog::GetTokensPerSample(size_t expected_count) {
std::vector<int64_t> tokens_per_sample;
tokens_per_sample.swap(tokens_per_sample_);
return tokens_per_sample;
}
PerfClock::time_point AsyncLog::GetMaxCompletionTime() {
return max_completion_timstamp_;
}
QuerySampleLatency AsyncLog::GetMaxLatencySoFar() {
std::unique_lock<std::mutex> lock(latencies_mutex_);
return max_latency_;
}
void AsyncLog::SetUseTokens(bool use_tokens) { use_tokens_ = use_tokens; }
void AsyncLog::SetNeedsFirstToken(bool needs_first_token) {
needs_first_token_ = needs_first_token;
}
/// \brief Records a single thread using thread-local storage and submits
/// entries to the central Logger.
///
/// \details This setup allows for each log entry to be added:
/// * With forward-progress guarantees. (i.e.: no locking or blocking
/// operations even if other threads have stalled.)
/// * Without expensive syscalls or I/O operations, which are deferred to
/// the central Logger.
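///
/// A minimal sketch of the intended call sequence (producer thread on one
/// side, the Logger's IO thread on the other; see Logger::IOThread below,
/// where async_log stands for the Logger's AsyncLog instance):
/// \code{.cpp}
/// // Producer thread:
/// tls_logger->Log(std::move(entry));  // may request a buffer swap
/// // IO thread, after gathering the swap request:
/// if (tls_logger->ReadBufferHasBeenConsumed()) tls_logger->SwapBuffers();
/// if (auto* entries = tls_logger->StartReadingEntries()) {
///   for (auto& e : *entries) e(async_log);  // stringification and IO
///   tls_logger->FinishReadingEntries();
/// }
/// \endcode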
class TlsLogger {
public:
TlsLogger(std::function<void()> forced_detatch);
~TlsLogger();
void ForcedDetatchFromThread() { forced_detatch_(); }
void Log(AsyncLogEntry&& entry);
void SwapBuffers();
std::vector<AsyncLogEntry>* StartReadingEntries();
void FinishReadingEntries();
bool ReadBufferHasBeenConsumed();
size_t MaxEntryVectorSize() { return max_entry_size_; }
uint64_t Pid() const { return pid_; }
uint64_t Tid() const { return tid_; }
void RequestSwapBuffersSlotRetried() {
swap_buffers_slot_retry_count_.fetch_add(1, std::memory_order_relaxed);
}
size_t ReportLogCasFailCount() {
size_t c = log_cas_fail_count_.load(std::memory_order_relaxed);
log_cas_fail_count_.fetch_sub(c, std::memory_order_relaxed);
return c;
}
size_t ReportSwapBuffersSlotRetryCount() {
size_t c = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed);
swap_buffers_slot_retry_count_.fetch_sub(c, std::memory_order_relaxed);
return c;
}
void TraceCounters();
private:
using EntryVector = std::vector<AsyncLogEntry>;
enum class EntryState { Unlocked, ReadLock, WriteLock };
// Accessed by producer only.
size_t i_read_ = 0;
// Accessed by producer and consumer atomically.
EntryVector entries_[2];
std::atomic<EntryState> entry_states_[2]{{EntryState::ReadLock},
{EntryState::Unlocked}};
std::atomic<size_t> i_write_{1};
std::atomic<size_t> log_cas_fail_count_{0};
std::atomic<size_t> swap_buffers_slot_retry_count_{0};
// Accessed by consumer only.
size_t unread_swaps_ = 0;
size_t i_write_prev_ = 0;
uint64_t pid_;
uint64_t tid_;
size_t max_entry_size_ = kTlsLogReservedEntryCount;
std::function<void()> forced_detatch_;
};
Logger::Logger(std::chrono::duration<double> poll_period,
size_t max_threads_to_log)
: poll_period_(poll_period),
max_threads_to_log_(max_threads_to_log),
thread_swap_request_slots_(max_threads_to_log * 2) {
const size_t kSlotCount = max_threads_to_log * 2;
for (size_t i = 0; i < kSlotCount; i++) {
std::atomic_init(&thread_swap_request_slots_[i],
SwapRequestSlotIsWritableValue(i));
}
}
Logger::~Logger() {
// TlsLoggers might outlive this Logger when loaded as a python module.
// Forcefully make all currently registered TlsLoggers orphans.
std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
TlsLogger* tls_logger_prev = nullptr;
(void)tls_logger_prev; // Avoid unused error in release builds.
while (!tls_loggers_registerd_.empty()) {
TlsLogger* tls_logger = *tls_loggers_registerd_.begin();
// Otherwise, this is an infinite loop.
assert(tls_logger != tls_logger_prev);
tls_loggers_registerd_mutex_.unlock();
tls_logger->ForcedDetatchFromThread();
tls_loggers_registerd_mutex_.lock();
tls_logger_prev = tls_logger;
}
}
void Logger::RequestSwapBuffers(TlsLogger* tls_logger) {
auto tls_logger_as_uint = reinterpret_cast<uintptr_t>(tls_logger);
assert(SwapRequestSlotIsReadable(tls_logger_as_uint));
size_t id, slot;
uintptr_t slot_is_writeable_value;
// The compare_exchange below should almost always succeed.
// The compare_exchange may fail if a recycled slot is still actively used
// by another thread, so we retry with subsequent slots here if needed.
// Since the slot count is 2x the expected number of threads to log,
// the CAS should only fail at most 50% of the time when all logging threads
// happen to be descheduled between the fetch_add and CAS below, which is
// very unlikely.
id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
slot = id % thread_swap_request_slots_.size();
slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
while (!thread_swap_request_slots_[slot].compare_exchange_strong(
slot_is_writeable_value, tls_logger_as_uint, std::memory_order_release)) {
id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
slot = id % thread_swap_request_slots_.size();
slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
tls_logger->RequestSwapBuffersSlotRetried();
}
}
void Logger::RegisterTlsLogger(TlsLogger* tls_logger) {
std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
if (tls_loggers_registerd_.size() >= max_threads_to_log_) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
"Warning: More TLS loggers registered than can be "
"active simultaneously.");
#else
LogErrorSync(
"Warning: More TLS loggers registered than can "
"be active simultaneously.\n");
#endif
}
tls_loggers_registerd_.insert(tls_logger);
}
// This moves ownership of the tls_logger data to Logger so the
// exiting thread can exit immediately, even if all the logs of the
// exiting thread haven't been processed.
void Logger::UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger) {
OrphanContainer::iterator orphan;
{
std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
tls_logger_orphans_.emplace_front(std::move(tls_logger));
orphan = tls_logger_orphans_.begin();
}
// Only remove the TlsLogger from the registry after adding to orphans so
// CollectTlsLoggerStats doesn't have any gaps in coverage.
{
std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
tls_loggers_registerd_.erase(orphan->get());
}
// This will flush the logs of |tls_logger| and mark it for destruction.
// Deferring destruction via orphans_to_destroy helps avoid use-after-frees
// when the IOThread calls FinishReadingEntries.
(*orphan)->Log([this, orphan](AsyncLog&) {
CollectTlsLoggerStats(orphan->get());
orphans_to_destroy_.push_back(orphan);
});
}
void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) {
tls_total_log_cas_fail_count_ += tls_logger->ReportLogCasFailCount();
tls_total_swap_buffers_slot_retry_count_ +=
tls_logger->ReportSwapBuffersSlotRetryCount();
size_t max_entry_vector_size = tls_logger->MaxEntryVectorSize();
if (max_entry_vector_size > kTlsLogReservedEntryCount) {
#if USE_NEW_LOGGING_FORMAT
std::stringstream msg;
msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid()
<< " reserved_entries: " << kTlsLogReservedEntryCount
<< " max_entries: " << max_entry_vector_size;
MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str());
#else
async_logger_.FlagWarning();
async_logger_.LogDetail("Logging allocation detected: ", "tid",
tls_logger->Tid(), "reserved_entries",
kTlsLogReservedEntryCount, "max_entries",
max_entry_vector_size);
#endif
}
}
void Logger::StartIOThread() {
{
std::unique_lock<std::mutex> lock(io_thread_mutex_);
keep_io_thread_alive_ = true;
}
io_thread_ = std::thread(&Logger::IOThread, this);
}
void Logger::StopIOThread() {
{
std::unique_lock<std::mutex> lock(io_thread_mutex_);
keep_io_thread_alive_ = false;
io_thread_cv_.notify_all();
}
io_thread_.join();
}
void Logger::StartLogging(std::ostream* summary, std::ostream* detail,
std::ostream* accuracy, bool copy_detail_to_stdout,
bool copy_summary_to_stdout) {
async_logger_.SetLogFiles(summary, detail, accuracy, copy_detail_to_stdout,
copy_summary_to_stdout, PerfClock::now());
}
void Logger::StopLogging() {
if (std::this_thread::get_id() == io_thread_.get_id()) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
"StopLogging() not supported from IO thread.");
#else
LogErrorSync("StopLogging() not supported from IO thread.");
#endif
return;
}
// Flush logs from this thread.
std::promise<void> io_thread_flushed_this_thread;
Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
io_thread_flushed_this_thread.get_future().wait();
async_logger_.SetLogFiles(&std::cerr, &std::cerr, &std::cerr, false, false,
PerfClock::now());
}
void Logger::StartNewTrace(std::ostream* trace_out,
PerfClock::time_point origin) {
async_logger_.StartNewTrace(trace_out, origin);
}
void Logger::StopTracing() {
// Flush traces from this thread.
std::promise<void> io_thread_flushed_this_thread;
Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
io_thread_flushed_this_thread.get_future().wait();
async_logger_.StopTrace();
}
void Logger::LogContentionAndAllocations() {
LogDetail([&](AsyncDetail& detail) {
{
std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
for (auto tls_logger : tls_loggers_registerd_) {
CollectTlsLoggerStats(tls_logger);
}
}
{
std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
for (auto& orphan : tls_logger_orphans_) {
CollectTlsLoggerStats(orphan.get());
}
}
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG(detail, "logger_swap_request_slots_retry_count",
swap_request_slots_retry_count_);
MLPERF_LOG(detail, "logger_swap_request_slots_retry_retry_count",
swap_request_slots_retry_retry_count_);
MLPERF_LOG(detail, "logger_swap_request_slots_retry_reencounter_count",
swap_request_slots_retry_reencounter_count_);
MLPERF_LOG(detail, "logger_start_reading_entries_retry_count",
start_reading_entries_retry_count_);
MLPERF_LOG(detail, "logger_tls_total_log_cas_fail_count",
tls_total_log_cas_fail_count_);
MLPERF_LOG(detail, "logger_tls_total_swap_buffers_slot_retry_count",
tls_total_swap_buffers_slot_retry_count_);
#else
detail("Log Contention Counters:");
detail(std::to_string(swap_request_slots_retry_count_) +
" : swap_request_slots_retry_count");
detail(std::to_string(swap_request_slots_retry_retry_count_) +
" : swap_request_slots_retry_retry_count");
detail(std::to_string(swap_request_slots_retry_reencounter_count_) +
" : swap_request_slots_retry_reencounter_count");
detail(std::to_string(start_reading_entries_retry_count_) +
" : start_reading_entries_retry_count");
detail(std::to_string(tls_total_log_cas_fail_count_) +
" : tls_total_log_cas_fail_count");
detail(std::to_string(tls_total_swap_buffers_slot_retry_count_) +
" : tls_total_swap_buffers_slot_retry_count");
#endif
swap_request_slots_retry_count_ = 0;
swap_request_slots_retry_retry_count_ = 0;
swap_request_slots_retry_reencounter_count_ = 0;
start_reading_entries_retry_count_ = 0;
tls_total_log_cas_fail_count_ = 0;
tls_total_swap_buffers_slot_retry_count_ = 0;
});
}
void Logger::RestartLatencyRecording(uint64_t first_sample_sequence_id,
size_t latencies_to_reserve) {
async_logger_.RestartLatencyRecording(first_sample_sequence_id,
latencies_to_reserve);
}
std::vector<QuerySampleLatency> Logger::GetLatenciesBlocking(
size_t expected_count) {
return async_logger_.GetLatenciesBlocking(expected_count);
}
std::vector<QuerySampleLatency> Logger::GetTokenLatencies(
size_t expected_count) {
return async_logger_.GetTokenLatencies(expected_count);
}
std::vector<QuerySampleLatency> Logger::GetTimePerOutputToken(
size_t expected_count) {
return async_logger_.GetTimePerOutputToken(expected_count);
}
std::vector<QuerySampleLatency> Logger::GetTokensPerSample(
size_t expected_count) {
return async_logger_.GetTokensPerSample(expected_count);
}
PerfClock::time_point Logger::GetMaxCompletionTime() {
return async_logger_.GetMaxCompletionTime();
}
QuerySampleLatency Logger::GetMaxLatencySoFar() {
return async_logger_.GetMaxLatencySoFar();
}
void Logger::SetUseTokens(bool use_tokens) {
async_logger_.SetUseTokens(use_tokens);
}
void Logger::SetNeedsFirstToken(bool needs_first_token) {
async_logger_.SetNeedsFirstToken(needs_first_token);
}
TlsLogger* Logger::GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id) {
uintptr_t slot_value = thread_swap_request_slots_[slot].load();
if (SwapRequestSlotIsReadable(slot_value)) {
// TODO: Convert this block to a simple write once we are confident
// that we don't need to check for success.
bool success = thread_swap_request_slots_[slot].compare_exchange_strong(
slot_value, SwapRequestSlotIsWritableValue(next_id));
if (!success) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING((*this), "warning_generic_message", "CAS failed.");
#else
LogErrorSync("CAS failed.", "line", __LINE__);
#endif
assert(success);
}
return reinterpret_cast<TlsLogger*>(slot_value);
}
return nullptr;
}
void Logger::GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
if (swap_request_slots_to_retry_.empty()) {
return;
}
std::vector<SlotRetry> retry_slots;
retry_slots.swap(swap_request_slots_to_retry_);
for (auto& slot_retry : retry_slots) {
TlsLogger* tls_logger =
GetTlsLoggerThatRequestedSwap(slot_retry.slot, slot_retry.next_id);
if (tls_logger) {
threads_to_swap->push_back(tls_logger);
} else {
swap_request_slots_to_retry_.push_back(slot_retry);
swap_request_slots_retry_retry_count_++;
}
}
}
void Logger::GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
auto swap_request_end = swap_request_id_.load(std::memory_order_acquire);
for (; swap_request_id_read_ < swap_request_end; swap_request_id_read_++) {
size_t slot = swap_request_id_read_ % thread_swap_request_slots_.size();
size_t next_id = swap_request_id_read_ + thread_swap_request_slots_.size();
TlsLogger* tls_logger = GetTlsLoggerThatRequestedSwap(slot, next_id);
if (tls_logger) {
threads_to_swap->push_back(tls_logger);
} else {
swap_request_slots_retry_count_++;
// A thread is in the middle of its call to RequestSwapBuffers.
// Retry later once it's done.
auto it = std::find_if(swap_request_slots_to_retry_.begin(),
swap_request_slots_to_retry_.end(),
[=](SlotRetry& s) { return s.slot == slot; });
if (it == swap_request_slots_to_retry_.end()) {
// This is the first time we are retrying the slot.
swap_request_slots_to_retry_.push_back({slot, next_id});
} else {
// Whoa. We've been retrying this slot since the last time it was
// encountered. Just update the next_id.
it->next_id = next_id;
swap_request_slots_retry_reencounter_count_++;
}
}
}
}
void Logger::IOThread() {
while (keep_io_thread_alive_) {
auto tracer1 =
MakeScopedTracer([](AsyncTrace& trace) { trace("IOThreadLoop"); });
{
auto tracer2 = MakeScopedTracer([](AsyncTrace& trace) { trace("Wait"); });
std::unique_lock<std::mutex> lock(io_thread_mutex_);
io_thread_cv_.wait_for(lock, poll_period_,
[&] { return !keep_io_thread_alive_; });
}
{
auto tracer3 =
MakeScopedTracer([](AsyncTrace& trace) { trace("Gather"); });
std::vector<TlsLogger*> threads_to_swap;
threads_to_swap.swap(threads_to_swap_deferred_);
GatherRetrySwapRequests(&threads_to_swap);
GatherNewSwapRequests(&threads_to_swap);
for (TlsLogger* thread : threads_to_swap) {
if (thread->ReadBufferHasBeenConsumed()) {
thread->SwapBuffers();
// After swapping a thread, it's ready to be read.
threads_to_read_.push_back(thread);
} else {
// Don't swap buffers again until we've finished reading the
// previous swap.
threads_to_swap_deferred_.push_back(thread);
}
}
}
{
auto tracer4 =
MakeScopedTracer([](AsyncTrace& trace) { trace("Process"); });
// Read from the threads we are confident have activity.
for (std::vector<TlsLogger*>::iterator thread = threads_to_read_.begin();
thread != threads_to_read_.end(); thread++) {
auto tracer5 =
MakeScopedTracer([tid = (*thread)->Tid()](AsyncTrace& trace) {
trace("Thread", "tid", tid);
});
std::vector<AsyncLogEntry>* entries = (*thread)->StartReadingEntries();
if (!entries) {
start_reading_entries_retry_count_++;
continue;
}
async_logger_.SetCurrentPidTid((*thread)->Pid(), (*thread)->Tid());
for (auto& entry : *entries) {
// Execute the entry to perform the serialization and I/O.
entry(async_logger_);
}
(*thread)->FinishReadingEntries();
// Mark for removal by the call to RemoveValue below.
*thread = nullptr;
}
// Only remove threads where reading succeeded so we retry the failed
// threads the next time around.
RemoveValue(&threads_to_read_, nullptr);
}
// Explicitly flush every time we wake up to minimize large implicit
// flushes, which could affect tail latency measurements, especially at
// percentiles closer to 100%.
/// \todo Determine if explicitly flushing logs every wake up is better
/// than relying on implicit flushing.
{
auto tracer6 =
MakeScopedTracer([](AsyncTrace& trace) { trace("FlushAll"); });
async_logger_.Flush();
}
if (!orphans_to_destroy_.empty()) {
auto tracer7 = MakeScopedTracer(
[](AsyncTrace& trace) { trace("Abandoning Orphans"); });
std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
for (auto orphan : orphans_to_destroy_) {
tls_logger_orphans_.erase(orphan);
}
orphans_to_destroy_.clear();
}
}
}
TlsLogger::TlsLogger(std::function<void()> forced_detatch)
: pid_(MLPERF_GET_PID()),
tid_(MLPERF_GET_TID()),
forced_detatch_(std::move(forced_detatch)) {
for (auto& entry : entries_) {
entry.reserve(kTlsLogReservedEntryCount);
}
}
TlsLogger::~TlsLogger() {}
// Log always makes forward progress since it can unconditionally obtain a
// "lock" on at least one of the buffers for writing.
// Notification is also lock-free.
void TlsLogger::Log(AsyncLogEntry&& entry) {
size_t cas_fail_count = 0;
auto unlocked = EntryState::Unlocked;
size_t i_write = i_write_.load(std::memory_order_relaxed);
while (!entry_states_[i_write].compare_exchange_strong(
unlocked, EntryState::WriteLock, std::memory_order_acquire,
std::memory_order_relaxed)) {
unlocked = EntryState::Unlocked;
i_write ^= 1;
// We may need to try 3 times, since there could be a race with a
// previous SwapBuffers request and we use memory_order_relaxed when
// loading i_write_ above.
cas_fail_count++;
if (cas_fail_count >= 3) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
"CAS failed.");
#else
GlobalLogger().LogErrorSync("CAS failed.", "times", cas_fail_count,
"line", __LINE__);
#endif
}
log_cas_fail_count_.fetch_add(1, std::memory_order_relaxed);
}
entries_[i_write].emplace_back(std::forward<AsyncLogEntry>(entry));
// TODO: Convert this block to a simple write once we are confident
// that we don't need to check for success.
auto write_lock = EntryState::WriteLock;
bool success = entry_states_[i_write].compare_exchange_strong(
write_lock, EntryState::Unlocked, std::memory_order_release);
if (!success) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
"CAS failed.");
#else
GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
#endif
assert(success);
}
bool write_buffer_swapped = i_write_prev_ != i_write;
if (write_buffer_swapped) {
GlobalLogger().RequestSwapBuffers(this);
i_write_prev_ = i_write;
}
}
void TlsLogger::SwapBuffers() {
// TODO: Convert this block to a simple write once we are confident
// that we don't need to check for success.
auto read_lock = EntryState::ReadLock;
bool success = entry_states_[i_read_].compare_exchange_strong(
read_lock, EntryState::Unlocked, std::memory_order_release);
if (!success) {
#if USE_NEW_LOGGING_FORMAT
MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
"CAS failed.");
#else
GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
#endif
assert(success);
}
i_write_.store(i_read_, std::memory_order_relaxed);
i_read_ ^= 1;
unread_swaps_++;
}
// Returns nullptr if read lock fails.
std::vector<AsyncLogEntry>* TlsLogger::StartReadingEntries() {
auto unlocked = EntryState::Unlocked;
if (entry_states_[i_read_].compare_exchange_strong(
unlocked, EntryState::ReadLock, std::memory_order_acquire,
std::memory_order_relaxed)) {
return &entries_[i_read_];
}
return nullptr;
}
void TlsLogger::FinishReadingEntries() {
// Detect first logging allocation and track max allocated size.
size_t new_size = entries_[i_read_].size();
if (new_size > max_entry_size_) {
if (max_entry_size_ == kTlsLogReservedEntryCount) {
Log([ts = PerfClock::now()](AsyncLog& log) {
log.TraceAsyncInstant("FirstAllocation", 0, ts);
});
}
max_entry_size_ = new_size;
}
entries_[i_read_].clear();
unread_swaps_--;
}
bool TlsLogger::ReadBufferHasBeenConsumed() { return unread_swaps_ == 0; }
void TlsLogger::TraceCounters() {
auto tracer = MakeScopedTracer(
[lcfc = log_cas_fail_count_.load(std::memory_order_relaxed),
sbsrc = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed)](
AsyncTrace& trace) {
trace("TlsLogger:ContentionCounters", "log_cas_fail_count", lcfc,
"swap_buffers_slot_retry_count", sbsrc);
});
}
Logger& GlobalLogger() {
static Logger g_logger(kLogPollPeriod, kMaxThreadsToLog);
return g_logger;
}
/// \brief Moves ownership of the TlsLogger to Logger on thread exit
/// so no round-trip synchronization with the IO thread is required.
struct TlsLoggerWrapper {
TlsLoggerWrapper(std::function<void()> forced_detatch)
: tls_logger(std::make_unique<TlsLogger>(std::move(forced_detatch))) {
GlobalLogger().RegisterTlsLogger(tls_logger.get());
}
~TlsLoggerWrapper() {
tls_logger->TraceCounters();
GlobalLogger().UnRegisterTlsLogger(std::move(tls_logger));
}
std::unique_ptr<TlsLogger> tls_logger;
};
TlsLoggerWrapper* InitializeMyTlsLoggerWrapper() {
thread_local std::unique_ptr<TlsLoggerWrapper> tls_logger_wrapper;
// forced_detatch lets the global Logger forcefully detach TlsLoggers
// from their threads in the Logger's destructor, which may run before
// thread-local variables are destroyed when the loadgen is used as a python
// module and dynamically unloaded.
// Note: We capture a pointer to the tls_logger_wrapper since variables of
// the thread-local storage class aren't actually captured. C++ spec says
// only variables of the automatic storage class are captured.
/// \todo There is a race where the same TlsLoggerWrapper might be
/// destroyed both naturally and via forced_detatch. Destruction of
/// the TlsLoggerWrapper should be locked.
auto forced_detatch = [tls_logger_wrapper = &tls_logger_wrapper]() {
tls_logger_wrapper->reset();
};
tls_logger_wrapper = std::make_unique<TlsLoggerWrapper>(forced_detatch);
return tls_logger_wrapper.get();
}
TlsLogger* InitializeMyTlsLogger() {
thread_local TlsLoggerWrapper* wrapper = InitializeMyTlsLoggerWrapper();
return wrapper->tls_logger.get();
}
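// A minimal sketch of how call sites use Log(): the lambda runs later on the
// IO thread, so it must capture everything it needs by value, e.g.:
//   Log([ts = PerfClock::now()](AsyncLog& log) {
//     log.TraceAsyncInstant("MyEvent", 0, ts);
//   });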
void Log(AsyncLogEntry&& entry) {
thread_local TlsLogger* const tls_logger = InitializeMyTlsLogger();
tls_logger->Log(std::forward<AsyncLogEntry>(entry));
}
} // namespace logging
} // namespace mlperf
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Internal logging implementation details.
#ifndef MLPERF_LOADGEN_LOGGING_H_
#define MLPERF_LOADGEN_LOGGING_H_
#define USE_NEW_LOGGING_FORMAT 1
#define MLPERF_LOG(logger, key, value) \
logger.Log((key), (value), __FILE__, __LINE__)
#define MLPERF_LOG_ERROR(logger, key, value) \
logger.LogError((key), (value), __FILE__, __LINE__)
#define MLPERF_LOG_ERROR_SYNC(logger, key, value) \
logger.LogErrorSync((key), (value), __FILE__, __LINE__)
#define MLPERF_LOG_WARNING(logger, key, value) \
logger.LogWarning((key), (value), __FILE__, __LINE__)
#define MLPERF_LOG_INTERVAL_START(logger, key, value) \
logger.LogIntervalStart((key), (value), __FILE__, __LINE__)
#define MLPERF_LOG_INTERVAL_END(logger, key, value) \
logger.LogIntervalEnd((key), (value), __FILE__, __LINE__)
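// For example, MLPERF_LOG(detail, "some_key", value) expands to
// detail.Log(("some_key"), (value), __FILE__, __LINE__), stamping the call
// site's file and line into the emitted log entry ("some_key" is
// illustrative only).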
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <functional>
#include <future>
#include <iomanip>
#include <iostream>
#include <list>
#include <map>
#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#include "query_sample.h"
namespace mlperf {
/// \brief Wait-free logging utilities that defer stringification
/// and syscalls to a worker thread.
namespace logging {
class AsyncLog;
class Logger;
class TlsLogger;
struct TlsLoggerWrapper;
/// \todo Verify lambdas are not allocating when bound to a std::function.
using AsyncLogEntry = std::function<void(AsyncLog&)>;
using PerfClock = std::chrono::high_resolution_clock;
/// \brief Logs the raw bytes as a hexadecimal ascii string.
struct LogBinaryAsHexString {
std::vector<uint8_t>* data;
};
/// \brief By default, print out the value directly.
template <typename T>
const T& ArgValueTransform(const T& value) {
return value;
}
/// \brief Print out True/False.
const std::string& ArgValueTransform(const bool& value);
/// \brief Print out binary data as a hex string.
const std::string ArgValueTransform(const LogBinaryAsHexString& value);
#if USE_NEW_LOGGING_FORMAT
/// \brief Print out a string in JSON format (with quotes).
const std::string ArgValueTransform(const std::string& value);
const std::string ArgValueTransform(const char* value);
/// \brief Prints a list of int in JSON format.
const std::string ArgValueTransform(const std::vector<size_t>& value);
/// \brief Prints a dict in JSON format.
const std::string ArgValueTransform(
const std::map<std::string, std::string>& value);
#endif
/// \brief Helper to print out values without quotes when value is a string.
template <typename T>
const T& ArgValueTransformWithoutQuote(const T& value) {
return ArgValueTransform<T>(value);
}
inline const std::string ArgValueTransformWithoutQuote(
const LogBinaryAsHexString& value) {
return ArgValueTransform(value);
}
/// \brief Helper to print out a string without the quotes.
inline const std::string ArgValueTransformWithoutQuote(
const std::string& value) {
return value;
}
/// \brief Outputs a trace that can be uploaded to chrome://tracing for
/// visualization.
/// \details Trace event format definition:
/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit?usp=sharing
class ChromeTracer {
public:
ChromeTracer(std::ostream* trace_out, PerfClock::time_point origin);
~ChromeTracer();
template <typename... Args>
void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid,
PerfClock::time_point start, PerfClock::time_point end,
const Args... args) {
*out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\","
<< "\"pid\":" << pid << "," << "\"tid\":" << tid << ","
<< "\"ts\":" << Micros(start - origin_).count() << ","
<< "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{";
AddArgs(args...);
*out_ << "}},\n";
}
template <typename... Args>
void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id,
PerfClock::time_point time, const Args... args) {
*out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
<< "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id
<< "," << "\"ts\":" << Micros(time - origin_).count() << ","
<< "\"args\":{";
AddArgs(args...);
*out_ << "}},\n";
}
template <typename... Args>
void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id,
PerfClock::time_point time, const Args... args) {
*out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
<< "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id
<< "," << "\"ts\":" << Micros(time - origin_).count() << ","
<< "\"args\":{";
AddArgs(args...);
*out_ << "}},\n";
}
template <typename... Args>
void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id,
PerfClock::time_point time) {
*out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
<< "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id
<< "," << "\"ts\":" << Micros(time - origin_).count() << "},\n";
}
template <typename... Args>
void AddCounterEvent(const std::string& name, uint64_t pid,
PerfClock::time_point time, const Args... args) {
*out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\","
<< "\"pid\":" << pid << ","
<< "\"ts\":" << Micros(time - origin_).count() << ","
<< "\"args\":{ ";
AddArgs(args...);
*out_ << "}},\n";
}
void Flush() { out_->flush(); }
private:
using Micros = std::chrono::duration<double, std::micro>;
void WriteTraceEventHeader();
void WriteTraceEventFooter();
void AddArgs() {}
template <typename T>
void AddArgs(const std::string& arg_name, const T& arg_value) {
*out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value);
}
template <typename T, typename... Args>
void AddArgs(const std::string& arg_name, const T& arg_value,
const Args... args) {
*out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value) << ",";
AddArgs(args...);
}
std::ostream* out_;
PerfClock::time_point origin_;
};
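// A complete event written by AddCompleteEvent above looks roughly like:
//   {"name":"Sample","ph":"X","pid":1,"tid":2,"ts":12.345,"dur":0.678,
//    "args":{"idx":42}},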
/// \brief The proxy all logging lambdas ultimately use to write any log type.
/// \details Passed as an argument to the log lambda on the
/// recording thread to serialize the data captured by the lambda and
/// forward it to the output stream.
/// \todo Make summary_out_, detail_out_, accuracy_out_, and trace_out_
/// instances of a new LogOutput interface that the client may override.
class AsyncLog {
public:
void SetLogFiles(std::ostream* summary, std::ostream* detail,
std::ostream* accuracy, bool copy_detail_to_stdout,
bool copy_summary_to_stdout,
PerfClock::time_point log_origin);
void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
void StopTrace();
void Flush();
void SetCurrentPidTid(uint64_t pid, uint64_t tid);
void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
const LogBinaryAsHexString& response, int64_t n_tokens);
void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response);
template <typename... Args>
void LogSummary(const std::string& message, const Args... args);
void SetLogDetailTime(PerfClock::time_point time) { log_detail_time_ = time; }
void FlagError() {
std::unique_lock<std::mutex> lock(log_mutex_);
log_error_count_++;
error_flagged_ = true;
}
void FlagWarning() {
std::unique_lock<std::mutex> lock(log_mutex_);
log_warning_count_++;
warning_flagged_ = true;
}
#if USE_NEW_LOGGING_FORMAT
template <typename T>
void LogDetail(const std::string& key, const T& value,
const std::string file_name, const unsigned int line_no);
#else
template <typename... Args>
void LogDetail(const std::string& message, const Args... args);
#endif
template <typename... Args>
void Trace(const std::string& trace_name, PerfClock::time_point start,
PerfClock::time_point end, const Args... args) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_, start,
end, args...);
}
}
template <typename... Args>
void TraceAsyncInstant(const std::string& trace_name, uint64_t id,
PerfClock::time_point instant_time,
const Args... args) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->AddAsyncInstantEvent(trace_name, current_pid_, id, instant_time,
args...);
}
}
void SetScopedTraceTimes(PerfClock::time_point start,
PerfClock::time_point end) {
scoped_start_ = start;
scoped_end_ = end;
}
template <typename... Args>
void ScopedTrace(const std::string& trace_name, const Args... args) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_,
scoped_start_, scoped_end_, args...);
}
}
template <typename... Args>
void TraceSample(const std::string& trace_name, uint64_t id,
PerfClock::time_point start, PerfClock::time_point end,
const Args... args) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->AddAsyncBeginEvent(trace_name, current_pid_, id, start, args...);
tracer_->AddAsyncEndEvent(trace_name, current_pid_, id, end);
}
}
template <typename... Args>
void TraceCounterEvent(const std::string& trace_name,
PerfClock::time_point time, const Args... args) {
std::unique_lock<std::mutex> lock(trace_mutex_);
if (tracer_) {
tracer_->AddCounterEvent(trace_name, current_pid_, time, args...);
}
}
void RestartLatencyRecording(uint64_t first_sample_sequence_id,
size_t latencies_to_reserve);
void RecordSampleCompletion(uint64_t sample_sequence_id,
PerfClock::time_point completion_time,
QuerySampleLatency latency, int64_t n_tokens);
void RecordTokenCompletion(uint64_t sample_sequence_id,
PerfClock::time_point completion_time,
QuerySampleLatency latency);
std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
std::vector<int64_t> GetTokensPerSample(size_t expected_count);
PerfClock::time_point GetMaxCompletionTime();
QuerySampleLatency GetMaxLatencySoFar();
void SetUseTokens(bool use_tokens);
void SetNeedsFirstToken(bool needs_first_token);
private:
void WriteAccuracyHeaderLocked();
void WriteAccuracyFooterLocked();
void LogArgs(std::ostream*) {}
template <typename T>
void LogArgs(std::ostream* out, const T& value_only) {
*out << ArgValueTransformWithoutQuote(value_only);
}
template <typename T>
void LogArgs(std::ostream* out, const std::string& arg_name,
const T& arg_value) {
*out << "\"" << arg_name
<< "\" : " << ArgValueTransformWithoutQuote(arg_value);
}
template <typename T, typename... Args>
void LogArgs(std::ostream* out, const std::string& arg_name,
const T& arg_value, const Args... args) {
*out << "\"" << arg_name
<< "\" : " << ArgValueTransformWithoutQuote(arg_value) << ", ";
LogArgs(out, args...);
}
std::mutex log_mutex_;
std::ostream* summary_out_ = &std::cerr;
std::ostream* detail_out_ = &std::cerr;
std::ostream* accuracy_out_ = &std::cerr;
// TODO: Instead of these bools, use a class that forwards to two streams.
bool copy_detail_to_stdout_ = false;
bool copy_summary_to_stdout_ = false;
bool accuracy_needs_comma_ = false;
PerfClock::time_point log_origin_;
size_t log_error_count_ = 0;
bool error_flagged_ = false;
size_t log_warning_count_ = 0;
bool warning_flagged_ = false;
bool use_tokens_ = false;
bool needs_first_token_ = false;
std::mutex trace_mutex_;
std::unique_ptr<ChromeTracer> tracer_;
uint64_t current_pid_;
uint64_t current_tid_;
PerfClock::time_point log_detail_time_;
PerfClock::time_point scoped_start_;
PerfClock::time_point scoped_end_;
std::mutex latencies_mutex_;
std::mutex token_latencies_mutex_;
std::mutex token_record_mutex_;
std::condition_variable all_latencies_recorded_;
uint64_t latencies_first_sample_sequence_id_ = 0;
std::vector<QuerySampleLatency> latencies_;
std::vector<QuerySampleLatency> token_latencies_;
std::vector<QuerySampleLatency> time_per_output_token_;
std::vector<LogBinaryAsHexString> token_records_;
std::vector<int64_t> tokens_per_sample_;
QuerySampleLatency max_latency_ = 0;
PerfClock::time_point max_completion_timstamp_;
size_t latencies_recorded_ = 0;
size_t latencies_expected_ = 0;
// Must be called with latencies_mutex_ held.
bool AllLatenciesRecorded() {
return latencies_recorded_ == latencies_expected_;
}
};
/// \brief The central logger that logs all threads belonging to a run.
class Logger {
public:
Logger(std::chrono::duration<double> poll_period, size_t max_threads_to_log);
~Logger();
void StartIOThread();
void StopIOThread();
void StartLogging(std::ostream* summary, std::ostream* detail,
std::ostream* accuracy, bool copy_detail_to_stdout,
bool copy_summary_to_stdout);
void StopLogging();
void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
void StopTracing();
void LogContentionAndAllocations();
void RestartLatencyRecording(uint64_t first_sample_sequence_id,
size_t latencies_to_reserve);
std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
std::vector<int64_t> GetTokensPerSample(size_t expected_count);
PerfClock::time_point GetMaxCompletionTime();
QuerySampleLatency GetMaxLatencySoFar();
void SetUseTokens(bool use_tokens);
void SetNeedsFirstToken(bool needs_first_token);
private:
friend AsyncLog;
friend TlsLogger;
friend TlsLoggerWrapper;
void RegisterTlsLogger(TlsLogger* tls_logger);
void UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger);
void RequestSwapBuffers(TlsLogger* tls_logger);
void CollectTlsLoggerStats(TlsLogger* tls_logger);
TlsLogger* GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id);
void GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap);
void GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap);
/// \brief The main logging thread function that handles the serialization
/// and I/O to the stream or file.
///
  /// \todo Provide client hook to set logging thread affinity and priority.
void IOThread();
// Slow synchronous error logging for internals that may prevent
// async logging from working.
#if USE_NEW_LOGGING_FORMAT
template <typename T>
void LogErrorSync(const std::string& key, const T& value,
const std::string file_name, const unsigned int line_no) {
/// \todo Acquire mutex once for FlagError + LogDetail to avoid
/// races. Better yet, switch to a non-stateful error API.
// This is better than nothing though.
async_logger_.FlagError();
async_logger_.LogDetail(key, value, file_name, line_no);
}
template <typename T>
void LogWarning(const std::string& key, const T& value,
const std::string file_name, const unsigned int line_no) {
async_logger_.FlagWarning();
async_logger_.LogDetail(key, value, file_name, line_no);
}
#else
template <typename... Args>
void LogErrorSync(const std::string& message, Args&&... args) {
/// \todo Acquire mutex once for FlagError + LogDetail to avoid
/// races. Better yet, switch to a non-stateful error API.
// This is better than nothing though.
async_logger_.FlagError();
async_logger_.LogDetail(message, std::forward<Args>(args)...);
}
#endif
  // Accessed by IOThread only.
const std::chrono::duration<double> poll_period_;
AsyncLog async_logger_;
const size_t max_threads_to_log_;
std::thread io_thread_;
  // Accessed by producers and IOThread during thread registration and
// destruction. Protected by io_thread_mutex_.
std::mutex io_thread_mutex_;
std::condition_variable io_thread_cv_;
bool keep_io_thread_alive_ = false;
std::mutex tls_loggers_registerd_mutex_;
std::unordered_set<TlsLogger*> tls_loggers_registerd_;
// Temporarily stores TlsLogger data for threads that have exited until
// all their log entries have been processed.
// Accessed by IOThread and producers as their threads exit.
std::mutex tls_logger_orphans_mutex_;
using OrphanContainer = std::list<std::unique_ptr<TlsLogger>>;
OrphanContainer tls_logger_orphans_;
  // Accessed by producers and IOThread atomically.
std::atomic<size_t> swap_request_id_{0};
std::vector<std::atomic<uintptr_t>> thread_swap_request_slots_;
  // Accessed by IOThread only.
size_t swap_request_id_read_{0};
struct SlotRetry {
size_t slot;
uintptr_t next_id;
};
std::vector<SlotRetry> swap_request_slots_to_retry_;
std::vector<TlsLogger*> threads_to_swap_deferred_;
std::vector<TlsLogger*> threads_to_read_;
std::vector<OrphanContainer::iterator> orphans_to_destroy_;
// Counts for retries related to the lock-free scheme.
// Abnormally high counts could be an indicator of contention.
// Access on IOThread only.
size_t swap_request_slots_retry_count_ = 0;
size_t swap_request_slots_retry_retry_count_ = 0;
size_t swap_request_slots_retry_reencounter_count_ = 0;
size_t start_reading_entries_retry_count_ = 0;
size_t tls_total_log_cas_fail_count_ = 0;
size_t tls_total_swap_buffers_slot_retry_count_ = 0;
};
Logger& GlobalLogger();
/// \brief The generic way to add a log entry.
/// \details Supports all types of logs, which is useful for complex
/// lambdas that may wish to log in multiple places or log something other
/// than a simple summary, detail, or trace entry.
void Log(AsyncLogEntry&& entry);
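// A minimal usage sketch, assuming AsyncLogEntry accepts any callable that
// takes an AsyncLog&:
//   Log([](AsyncLog& log) { log.LogSummary("Custom log entry"); });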
/// \brief The convenience proxy a LogSummary lambda uses to write to the
/// summary log.
class AsyncSummary {
public:
explicit AsyncSummary(AsyncLog& async_log) : async_log_(async_log) {}
AsyncLog& async_log() { return async_log_; }
template <typename... Args>
AsyncLog& operator()(Args&&... args) {
async_log_.LogSummary(std::forward<Args>(args)...);
return async_log_;
}
private:
AsyncLog& async_log_;
};
/// \brief A helper to simplify adding a summary log entry.
template <typename LambdaT>
void LogSummary(LambdaT&& lambda) {
Log([lambda = std::forward<LambdaT>(lambda)](AsyncLog& log) mutable {
AsyncSummary async_summary(log);
lambda(async_summary);
});
}
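// Illustrative usage sketch of the helper above; the summary proxy forwards
// its arguments to AsyncLog::LogSummary:
//   LogSummary([](AsyncSummary& summary) {
//     summary("Samples per second : ", 1234.5);
//   });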
/// \brief The convenience proxy a LogDetail lambda uses to write to the detail
/// log.
class AsyncDetail {
public:
explicit AsyncDetail(AsyncLog& async_log) : async_log_(async_log) {}
AsyncLog& async_log() { return async_log_; }
#if USE_NEW_LOGGING_FORMAT
template <typename T>
AsyncLog& Log(const std::string& key, const T& value,
const std::string file_name, const unsigned int line_no) {
async_log_.LogDetail(key, value, file_name, line_no);
return async_log_;
}
template <typename T>
AsyncLog& LogError(const std::string& key, const T& value,
const std::string file_name, const unsigned int line_no) {
async_log_.FlagError();
async_log_.LogDetail(key, value, file_name, line_no);
return async_log_;
}
template <typename T>
AsyncLog& LogWarning(const std::string& key, const T& value,
const std::string file_name,
const unsigned int line_no) {
async_log_.FlagWarning();
async_log_.LogDetail(key, value, file_name, line_no);
return async_log_;
}
template <typename T>
AsyncLog& LogIntervalStart(const std::string& key, const T& value,
const std::string file_name,
const unsigned int line_no) {
async_log_.LogDetail(key, value, file_name, line_no);
return async_log_;
}
template <typename T>
AsyncLog& LogIntervalEnd(const std::string& key, const T& value,
const std::string file_name,
const unsigned int line_no) {
async_log_.LogDetail(key, value, file_name, line_no);
return async_log_;
}
#else
template <typename... Args>
AsyncLog& operator()(Args&&... args) {
async_log_.LogDetail(std::forward<Args>(args)...);
return async_log_;
}
template <typename... Args>
AsyncLog& Error(Args&&... args) {
async_log_.FlagError();
async_log_.LogDetail(std::forward<Args>(args)...);
return async_log_;
}
template <typename... Args>
AsyncLog& Warning(Args&&... args) {
async_log_.FlagWarning();
async_log_.LogDetail(std::forward<Args>(args)...);
return async_log_;
}
#endif
private:
AsyncLog& async_log_;
};
/// \brief A helper to simplify adding a detail log entry.
template <typename LambdaT>
void LogDetail(LambdaT&& lambda) {
Log([lambda = std::forward<LambdaT>(lambda),
timestamp = PerfClock::now()](AsyncLog& log) mutable {
log.SetLogDetailTime(timestamp);
AsyncDetail async_detail(log);
lambda(async_detail);
});
}
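// Illustrative usage sketch of the helper above, assuming
// USE_NEW_LOGGING_FORMAT is defined; the key and value are placeholders:
//   LogDetail([](AsyncDetail& detail) {
//     detail.Log("example_key", std::string("example_value"), __FILE__,
//                __LINE__);
//   });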
/// \brief The convenience proxy a ScopedTracer lambda uses to write to the
/// detail log.
class AsyncTrace {
public:
explicit AsyncTrace(AsyncLog& async_log) : async_log_(async_log) {}
AsyncLog& async_log() { return async_log_; }
template <typename... Args>
AsyncLog& operator()(Args&&... args) {
async_log_.ScopedTrace(std::forward<Args>(args)...);
return async_log_;
}
private:
AsyncLog& async_log_;
};
/// \brief ScopedTracer is an RAII object that traces the start and end
/// of its lifetime.
template <typename LambdaT>
class ScopedTracer {
public:
ScopedTracer(LambdaT&& lambda)
: start_(PerfClock::now()), lambda_(std::forward<LambdaT>(lambda)) {}
~ScopedTracer() {
Log([start = start_, lambda = std::move(lambda_),
end = PerfClock::now()](AsyncLog& log) {
log.SetScopedTraceTimes(start, end);
AsyncTrace async_trace(log);
lambda(async_trace);
});
}
private:
PerfClock::time_point start_;
LambdaT lambda_;
};
/// \brief Helper that creates a ScopedTracer with automatic type deduction.
/// \details Helps with automatic template type deduction, which has been
/// supported for functions for a long time.
/// C++17 will support deduction for classes, which will neutralize the utility
/// of a helper function like this.
/// \todo Determine which traces to keep for submission purposes.
template <typename LambdaT>
auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
}
template <typename... Args>
void AsyncLog::LogSummary(const std::string& message, const Args... args) {
auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
std::string sanitized_message = message;
std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
trace("LogSummary", "message", "\"" + sanitized_message + "\"");
});
std::unique_lock<std::mutex> lock(log_mutex_);
*summary_out_ << message;
LogArgs(summary_out_, args...);
*summary_out_ << "\n";
if (copy_summary_to_stdout_) {
std::cout << message;
LogArgs(&std::cout, args...);
std::cout << "\n";
}
}
#if USE_NEW_LOGGING_FORMAT
template <typename T>
void AsyncLog::LogDetail(const std::string& key, const T& value,
const std::string file_name,
const unsigned int line_no) {
auto tracer = MakeScopedTracer([key](AsyncTrace& trace) {
std::string sanitized_key = key;
std::replace(sanitized_key.begin(), sanitized_key.end(), '"', '\'');
std::replace(sanitized_key.begin(), sanitized_key.end(), '\n', ';');
trace("LogDetail", "key", "\"" + sanitized_key + "\"");
});
std::unique_lock<std::mutex> lock(log_mutex_);
std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
if (!copy_detail_to_stdout_) {
detail_streams.pop_back();
}
auto time_ns = (log_detail_time_ - log_origin_).count();
for (auto os : detail_streams) {
*os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", "
<< "\"value\": " << ArgValueTransform(value) << ", "
<< "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "."
<< std::setfill('0') << std::setw(6)
<< ArgValueTransform(time_ns % 1000000ULL) << ", "
<< "\"namespace\": \"mlperf::logging\", "
<< "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {"
<< "\"is_error\": " << ArgValueTransform(error_flagged_) << ", "
<< "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", "
<< "\"file\": \"" << file_name << "\", "
<< "\"line_no\": " << ArgValueTransform(line_no) << ", "
<< "\"pid\": " << ArgValueTransform(current_pid_) << ", "
<< "\"tid\": " << ArgValueTransform(current_tid_) << "}}\n";
if (error_flagged_) {
os->flush();
}
}
error_flagged_ = false;
warning_flagged_ = false;
}
#else
template <typename... Args>
void AsyncLog::LogDetail(const std::string& message, const Args... args) {
auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
std::string sanitized_message = message;
std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
trace("LogDetail", "message", "\"" + sanitized_message + "\"");
});
std::unique_lock<std::mutex> lock(log_mutex_);
std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
if (!copy_detail_to_stdout_) {
detail_streams.pop_back();
}
for (auto os : detail_streams) {
*os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_
<< ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count()
<< "ns : ";
if (error_flagged_) {
*os << "ERROR : ";
} else if (warning_flagged_) {
*os << "WARNING : ";
}
*os << message;
LogArgs(os, args...);
*os << "\n";
if (error_flagged_) {
os->flush();
}
}
error_flagged_ = false;
warning_flagged_ = false;
}
#endif
} // namespace logging
// Export some things out of the logging namespace to simplify call sites.
const auto GlobalLogger = logging::GlobalLogger;
const auto Log = logging::Log;
using PerfClock = logging::PerfClock;
using LogBinaryAsHexString = logging::LogBinaryAsHexString;
using AsyncLog = logging::AsyncLog;
using AsyncSummary = logging::AsyncSummary;
template <typename LambdaT>
void LogSummary(LambdaT&& lambda) {
logging::LogSummary(std::forward<LambdaT>(lambda));
}
using AsyncDetail = logging::AsyncDetail;
template <typename LambdaT>
void LogDetail(LambdaT&& lambda) {
logging::LogDetail(std::forward<LambdaT>(lambda));
}
using AsyncTrace = logging::AsyncTrace;
template <typename LambdaT>
using ScopedTracer = logging::ScopedTracer<LambdaT>;
template <typename LambdaT>
auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
}
} // namespace mlperf
#endif // MLPERF_LOADGEN_LOGGING_H_
Metadata-Version: 2.4
Name: mlcommons_loadgen
Version: 5.0.13
Summary: MLPerf Inference LoadGen python bindings
Home-page: https://mlcommons.org/
Description-Content-Type: text/markdown
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: summary
# Overview {#mainpage}
## Introduction
* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
the performance of inference systems.
* It generates traffic for scenarios as formulated by a diverse set of experts
in the [MLCommons working group](https://mlcommons.org/).
* The scenarios emulate the workloads seen in mobile devices,
autonomous vehicles, robotics, and cloud-based setups.
* Although the LoadGen is not model or dataset aware, its strength is in its
reusability with logic that is.
## Integration Example and Flow
The following diagram shows how the LoadGen can be integrated into an
inference system, resembling how some of the MLPerf reference models are
implemented.
<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
<ol style="padding: 20px">
<li>Benchmark knows the model, dataset, and preprocessing.</li>
<li>Benchmark hands dataset sample IDs to LoadGen.</li>
<li>LoadGen starts generating queries of sample IDs.</li>
<li>Benchmark creates requests to backend.</li>
<li>Result is post processed and forwarded to LoadGen.</li>
<li>LoadGen outputs logs for analysis.</li>
</ol>
</div>
## Useful Links
* [FAQ](README_FAQ.md)
* [LoadGen Build Instructions](README_BUILD.md)
* [LoadGen API](loadgen.h)
* [Test Settings](test_settings.h) -
A good description of available scenarios, modes, and knobs.
* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
Includes source for the LoadGen and reference models that use the LoadGen.
* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
Any mismatch with this is a bug in the LoadGen.
## Scope of the LoadGen's Responsibilities
### In Scope
* **Provide a reusable** C++ library with python bindings.
* **Implement** the traffic patterns of the MLPerf Inference scenarios and
modes.
* **Record** all traffic generated and received for later analysis and
verification.
* **Summarize** the results and whether performance constraints were met.
* **Target high-performance** systems with efficient multi-thread friendly
logging utilities.
* **Generate trust** via a shared, well-tested, and community-hardened
code base.
### Out of Scope
The LoadGen is:
* **NOT** aware of the ML model it is running against.
* **NOT** aware of the data formats of the model's inputs and outputs.
* **NOT** aware of how to score the accuracy of a model's outputs.
* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
Limiting the scope of the LoadGen in this way keeps it reusable across
different models and datasets without modification. Using composition and
dependency injection, the user can define their own models, datasets, and
metrics.
Additionally, not hardcoding MLPerf-specific test constraints, like test
duration and performance targets, allows users to use the LoadGen unmodified
for custom testing and continuous integration purposes.
## Submission Considerations
### Upstream all local modifications
* As a rule, no local modifications to the LoadGen's C++ library are allowed
for submission.
* Please upstream early and often to keep the playing field level.
### Choose your TestSettings carefully!
* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
requirements for submission, *e.g.*, target percentiles and latencies.
* For verification, the values in TestSettings are logged.
* To help make sure your settings are spec compliant, use
TestSettings::FromConfig in conjunction with the relevant config file provided
with the reference models.
## Responsibilities of a LoadGen User
### Implement the Interfaces
* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
them to the StartTest function.
* Call QuerySamplesComplete for every sample received by
SystemUnderTest::IssueQuery, as in the sketch below.
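A minimal sketch of this wiring, assuming hypothetical `MySut` and `MyQsl`
classes that implement the two interfaces (see loadgen.h for the exact
`StartTest` parameters):
```CPP
#include <string>
#include <vector>

#include "loadgen.h"
#include "query_sample_library.h"
#include "system_under_test.h"
#include "test_settings.h"

// Hypothetical do-nothing SUT: completes every sample with an empty response.
class MySut : public mlperf::SystemUnderTest {
 public:
  const std::string& Name() override { return name_; }
  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
    std::vector<mlperf::QuerySampleResponse> responses;
    responses.reserve(samples.size());
    for (const auto& sample : samples) {
      responses.push_back({sample.id, 0, 0});
    }
    mlperf::QuerySamplesComplete(responses.data(), responses.size());
  }
  void FlushQueries() override {}

 private:
  std::string name_{"MySut"};
};

// Hypothetical QSL whose samples need no actual loading.
class MyQsl : public mlperf::QuerySampleLibrary {
 public:
  const std::string& Name() override { return name_; }
  size_t TotalSampleCount() override { return 1024; }
  size_t PerformanceSampleCount() override { return 256; }
  void LoadSamplesToRam(
      const std::vector<mlperf::QuerySampleIndex>&) override {}
  void UnloadSamplesFromRam(
      const std::vector<mlperf::QuerySampleIndex>&) override {}

 private:
  std::string name_{"MyQsl"};
};

int main() {
  MySut sut;
  MyQsl qsl;
  mlperf::TestSettings settings;     // choose scenario/mode here
  mlperf::LogSettings log_settings;  // output directory, stdout copies, etc.
  mlperf::StartTest(&sut, &qsl, settings, log_settings);
  return 0;
}
```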
### Assess Accuracy
* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
the accuracy of your system.
* For the official models, Python scripts will be provided by the MLPerf model
owners for you to do this automatically.
For templates of how to do the above in detail, refer to code for the demos,
tests, and reference models.
## LoadGen over the Network
For reference, at a high level a submission looks like this:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
</div>
The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
</div>
The QDL is a proxy for a load balancer that dispatches queries to the SUT over a physical network, receives the responses, and passes them back to the LoadGen. It is implemented by the submitter. The interface of the QDL is the same as the API to the SUT.
In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in the future.
MLPerf over the Network runs in Server mode and Offline mode. All LoadGen modes are expected to work as-is with insignificant changes. These include running the test in performance mode, accuracy mode, find-peak-performance mode, and compliance mode. The same applies to power measurements.
### QDL details
The Query Dispatch Library is implemented by the submitter and interfaces with the LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class, which is defined in system_under_test.h. The QDL implements the `mlperf::QueryDispatchLibrary` class, which inherits from `mlperf::SystemUnderTest`, has the same API, and supports all existing `mlperf::SystemUnderTest` methods. It has a separate header file, query_dispatch_library.h. Passing the QDL to LoadGen's StartTest as an `mlperf::SystemUnderTest` is a natural upcast of the `mlperf::QueryDispatchLibrary` class.
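For illustration, a minimal QDL skeleton might look like the following sketch; the class name `MyQdl` and the transport helper `SendOverNetwork` are placeholders, and everything network-related is left to the submitter:
```CPP
#include <string>
#include <vector>

#include "query_dispatch_library.h"

class MyQdl : public mlperf::QueryDispatchLibrary {
 public:
  const std::string& Name() override { return name_; }
  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
    // Serialize the samples and transmit them to the SUT over the physical
    // network. Responses arrive asynchronously and are handed back to the
    // LoadGen via mlperf::QuerySamplesComplete.
    SendOverNetwork(samples);
  }
  void FlushQueries() override {}

 private:
  void SendOverNetwork(const std::vector<mlperf::QuerySample>&) {
    // Submitter-specific transport elided.
  }
  std::string name_{"Network SUT"};
};
```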
#### QDL Query issue and response over the network
The QDL gets the queries from the LoadGen through
```CPP
void IssueQuery(const std::vector<QuerySample>& samples)
```
The QDL dispatches the queries to the SUT over the physical medium. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to serialize the query, load balance, drive it through the operating system and network interface card, and send it to the SUT.
The QDL receives the query responses over the network from the SUT. The exact method and implementation are submitter specific and are not specified by MLCommons. The submitter's implementation includes all methods required to receive the network data from the network interface card, pass it through the operating system, deserialize the query response, and provide it back to the LoadGen through query completion:
```CPP
struct QuerySampleResponse {
ResponseId id;
uintptr_t data;
size_t size;
};
void QuerySamplesComplete(QuerySampleResponse* responses,
size_t response_count);
```
#### QDL Additional Methods
In addition, the QDL needs to implement the following methods, which the SUT interface provides to the LoadGen:
```CPP
const std::string& Name();
```
The `Name` function returns a known string that identifies the SUT as an over-the-network benchmark.
```CPP
void FlushQueries();
```
It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
### Example
Refer to the [LON demo](demos/lon) for a reference example illustrating usage of the LoadGen over the network.
MANIFEST.in
README.md
VERSION.txt
__init__.py
early_stopping.cc
early_stopping.h
issue_query_controller.cc
issue_query_controller.h
loadgen.cc
loadgen.h
logging.cc
logging.h
mlperf.conf
mlperf_conf.h
pyproject.toml
query_dispatch_library.h
query_sample.h
query_sample_library.h
results.cc
results.h
setup.py
system_under_test.h
test_settings.h
test_settings_internal.cc
test_settings_internal.h
utils.cc
utils.h
version.cc
version.h
version_generator.py
./__init__.py
./version_generator.py
bindings/c_api.cc
bindings/c_api.h
bindings/python_api.cc
generated/version_generated.cc
mlcommons_loadgen.egg-info/PKG-INFO
mlcommons_loadgen.egg-info/SOURCES.txt
mlcommons_loadgen.egg-info/dependency_links.txt
mlcommons_loadgen.egg-info/top_level.txt
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds
# Set performance_sample_count for each model.
# User can optionally set this to higher values in user.conf.
resnet50.*.performance_sample_count_override = 1024
ssd-mobilenet.*.performance_sample_count_override = 256
retinanet.*.performance_sample_count_override = 64
bert.*.performance_sample_count_override = 10833
dlrm.*.performance_sample_count_override = 204800
dlrm-v2.*.performance_sample_count_override = 204800
rnnt.*.performance_sample_count_override = 2513
gptj.*.performance_sample_count_override = 13368
mixtral-8x7b.*.performance_sample_count_override = 15000
llama2-70b.*.performance_sample_count_override = 24576
llama2-70b-interactive.*.performance_sample_count_override = 24576
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
# set to 0 to let the entire sample set be the performance sample set
3d-unet.*.performance_sample_count_override = 0
# Set seeds.
*.*.qsl_rng_seed = 6023615788873153749
*.*.sample_index_rng_seed = 15036839855038426416
*.*.schedule_rng_seed = 9933818062894767841
# Set seeds for TEST_05.
*.*.test05_qsl_rng_seed = 7975553102935885558
*.*.test05_sample_index_rng_seed = 11403566307062068064
*.*.test05_schedule_rng_seed = 15816800565822761601
*.SingleStream.target_latency_percentile = 90
*.SingleStream.min_duration = 600000
*.MultiStream.target_latency_percentile = 99
*.MultiStream.samples_per_query = 8
*.MultiStream.min_duration = 600000
*.MultiStream.min_query_count = 662
retinanet.MultiStream.target_latency = 528
# 3D-UNet uses equal issue mode because it has non-uniform inputs
3d-unet.*.sample_concatenate_permutation = 1
# R-GAT uses equal issue mode because it may have non-uniform inputs
rgat.*.sample_concatenate_permutation = 1
# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenarios
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1
llama2-70b-interactive.*.sample_concatenate_permutation = 1
mixtral-8x7b.*.sample_concatenate_permutation = 1
llama3_1-405b.*.sample_concatenate_permutation = 1
*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
*.Server.target_duration = 0
*.Server.min_duration = 600000
resnet50.Server.target_latency = 15
retinanet.Server.target_latency = 100
bert.Server.target_latency = 130
dlrm.Server.target_latency = 60
dlrm-v2.Server.target_latency = 60
rnnt.Server.target_latency = 1000
gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000
# Benchmarks that measure token latencies
llama2-70b.*.use_token_latencies = 1
llama2-70b-interactive.*.use_token_latencies = 1
mixtral-8x7b.*.use_token_latencies = 1
llama3_1-405b.*.use_token_latencies = 1
# gptj benchmark infers token latencies
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69
# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7b & llama3_1-405b benchmarks, therefore target_latency = 0
llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200
# Target Latencies for low latency setting
llama2-70b-interactive.Server.target_latency = 0
llama2-70b-interactive.Server.ttft_latency = 450
llama2-70b-interactive.Server.tpot_latency = 40
mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200
llama3_1-405b.Server.target_latency = 0
llama3_1-405b.Server.ttft_latency = 6000
llama3_1-405b.Server.tpot_latency = 175
*.Offline.target_latency_percentile = 90
*.Offline.min_duration = 600000
# In the Offline scenario, we always have one query, but LoadGen maps this to
# min_sample_count internally. If the dataset size is larger than 24576,
# we limit min_query_count to 24576; otherwise we use the dataset size as
# the limit
resnet50.Offline.min_query_count = 24576
retinanet.Offline.min_query_count = 24576
dlrm-v2.Offline.min_query_count = 24576
bert.Offline.min_query_count = 10833
gptj.Offline.min_query_count = 13368
rnnt.Offline.min_query_count = 2513
3d-unet.Offline.min_query_count = 43
stable-diffusion-xl.Offline.min_query_count = 5000
llama2-70b.Offline.min_query_count = 24576
llama3_1-405b.Offline.min_query_count = 8313
mixtral-8x7b.Offline.min_query_count = 15000
rgat.Offline.min_query_count = 788379
# These fields should be defined and overridden by user.conf.
*.SingleStream.target_latency = 10
*.MultiStream.target_latency = 80
*.Server.target_qps = 1.0
*.Offline.target_qps = 1.0
const char* mlperf_conf =
"# The format of this config file is 'key = value'.\n"
"# The key has the format 'model.scenario.key'. Value is mostly int64_t.\n"
"# Model maybe '*' as wildcard. In that case the value applies to all models.\n"
"# All times are in milli seconds\n"
"\n"
"# Set performance_sample_count for each model.\n"
"# User can optionally set this to higher values in user.conf.\n"
"resnet50.*.performance_sample_count_override = 1024\n"
"ssd-mobilenet.*.performance_sample_count_override = 256\n"
"retinanet.*.performance_sample_count_override = 64\n"
"bert.*.performance_sample_count_override = 10833\n"
"dlrm.*.performance_sample_count_override = 204800\n"
"dlrm-v2.*.performance_sample_count_override = 204800\n"
"rnnt.*.performance_sample_count_override = 2513\n"
"gptj.*.performance_sample_count_override = 13368\n"
"mixtral-8x7b.*.performance_sample_count_override = 15000\n"
"llama2-70b.*.performance_sample_count_override = 24576\n"
"llama2-70b-interactive.*.performance_sample_count_override = 24576\n"
"llama3_1-405b.*.performance_sample_count_override = 8313\n"
"stable-diffusion-xl.*.performance_sample_count_override = 5000\n"
"rgat.*.performance_sample_count_override = 788379\n"
"# set to 0 to let entire sample set to be performance sample\n"
"3d-unet.*.performance_sample_count_override = 0\n"
"\n"
"# Set seeds.\n"
"*.*.qsl_rng_seed = 6023615788873153749\n"
"*.*.sample_index_rng_seed = 15036839855038426416\n"
"*.*.schedule_rng_seed = 9933818062894767841\n"
"# Set seeds for TEST_05.\n"
"*.*.test05_qsl_rng_seed = 7975553102935885558\n"
"*.*.test05_sample_index_rng_seed = 11403566307062068064\n"
"*.*.test05_schedule_rng_seed = 15816800565822761601\n"
"\n"
"\n"
"*.SingleStream.target_latency_percentile = 90\n"
"*.SingleStream.min_duration = 600000\n"
"\n"
"*.MultiStream.target_latency_percentile = 99\n"
"*.MultiStream.samples_per_query = 8\n"
"*.MultiStream.min_duration = 600000\n"
"*.MultiStream.min_query_count = 662\n"
"retinanet.MultiStream.target_latency = 528\n"
"\n"
"# 3D-UNet uses equal issue mode because it has non-uniform inputs\n"
"3d-unet.*.sample_concatenate_permutation = 1\n"
"\n"
"# R-GAT uses equal issue mode because it may have non-uniform inputs\n"
"rgat.*.sample_concatenate_permutation = 1\n"
"\n"
"# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario\n"
"gptj.*.sample_concatenate_permutation = 1\n"
"llama2-70b.*.sample_concatenate_permutation = 1\n"
"llama2-70b-interactive.*.sample_concatenate_permutation = 1\n"
"mixtral-8x7b.*.sample_concatenate_permutation = 1\n"
"llama3_1-405b.*.sample_concatenate_permutation = 1\n"
"\n"
"*.Server.target_latency = 10\n"
"*.Server.target_latency_percentile = 99\n"
"*.Server.target_duration = 0\n"
"*.Server.min_duration = 600000\n"
"resnet50.Server.target_latency = 15\n"
"retinanet.Server.target_latency = 100\n"
"bert.Server.target_latency = 130\n"
"dlrm.Server.target_latency = 60\n"
"dlrm-v2.Server.target_latency = 60\n"
"rnnt.Server.target_latency = 1000\n"
"gptj.Server.target_latency = 20000\n"
"stable-diffusion-xl.Server.target_latency = 20000\n"
"# Benchmarks that measure token latencies\n"
"llama2-70b.*.use_token_latencies = 1\n"
"llama2-70b-interactive.*.use_token_latencies = 1\n"
"mixtral-8x7b.*.use_token_latencies = 1\n"
"llama3_1-405b.*.use_token_latencies = 1\n"
"# gptj benchmark infers token latencies\n"
"gptj.*.infer_token_latencies = 1\n"
"gptj.*.token_latency_scaling_factor = 69\n"
"# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0\n"
"llama2-70b.Server.target_latency = 0\n"
"llama2-70b.Server.ttft_latency = 2000\n"
"llama2-70b.Server.tpot_latency = 200\n"
"\n"
"# Target Latencies for low latency setting\n"
"llama2-70b-interactive.Server.target_latency = 0\n"
"llama2-70b-interactive.Server.ttft_latency = 450\n"
"llama2-70b-interactive.Server.tpot_latency = 40\n"
"\n"
"mixtral-8x7b.Server.target_latency = 0\n"
"mixtral-8x7b.Server.ttft_latency = 2000\n"
"mixtral-8x7b.Server.tpot_latency = 200\n"
"\n"
"llama3_1-405b.Server.target_latency = 0\n"
"llama3_1-405b.Server.ttft_latency = 6000\n"
"llama3_1-405b.Server.tpot_latency = 175\n"
"\n"
"*.Offline.target_latency_percentile = 90\n"
"*.Offline.min_duration = 600000\n"
"\n"
"# In Offline scenario, we always have one query. But LoadGen maps this to\n"
"# min_sample_count internally in Offline scenario. If the dataset size is larger\n"
"# than 24576 we limit the min_query_count to 24576 and otherwise we use\n"
"# the dataset size as the limit\n"
"\n"
"resnet50.Offline.min_query_count = 24576\n"
"retinanet.Offline.min_query_count = 24576\n"
"dlrm-v2.Offline.min_query_count = 24576\n"
"bert.Offline.min_query_count = 10833\n"
"gptj.Offline.min_query_count = 13368\n"
"rnnt.Offline.min_query_count = 2513\n"
"3d-unet.Offline.min_query_count = 43\n"
"stable-diffusion-xl.Offline.min_query_count = 5000\n"
"llama2-70b.Offline.min_query_count = 24576\n"
"llama3_1-405b.Offline.min_query_count = 8313\n"
"mixtral-8x7b.Offline.min_query_count = 15000\n"
"rgat.Offline.min_query_count = 788379\n"
"\n"
"# These fields should be defined and overridden by user.conf.\n"
"*.SingleStream.target_latency = 10\n"
"*.MultiStream.target_latency = 80\n"
"*.Server.target_qps = 1.0\n"
"*.Offline.target_qps = 1.0\n"
"";
[build-system]
requires = ["setuptools>=42", "wheel", "pybind11==2.11.1"]
build-backend = "setuptools.build_meta:__legacy__"
[tool.cibuildwheel]
environment = "CFLAGS='-std=c++14'"
build = "cp3{7,8,9,10,11,12,13}-*"
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Defines the QueryDispatchLibrary interface.
#ifndef MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
#define MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
#include <string>
#include "system_under_test.h"
namespace mlperf {
/// \addtogroup LoadgenAPI
/// @{
/// \brief The interface a client implements for the LoadGen to test over the
/// network. The API inherits the system_under_test.h API. When working in LON
/// mode, the QueryDispatchLibrary class is used and natively upcast to the
/// SystemUnderTest class.
class QueryDispatchLibrary : public SystemUnderTest {
public:
virtual ~QueryDispatchLibrary() = default;
};
/// @}
} // namespace mlperf
#endif // MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Defines the structs involved in issuing a query and responding to
/// a query.
/// \details These are broken out into their own files since they are exposed
/// as part of the C API and we want to avoid C clients including C++ code.
#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_H_
#define MLPERF_LOADGEN_QUERY_SAMPLE_H_
#include <stddef.h>
#include <stdint.h>
#include <iostream>
namespace mlperf {
/// \addtogroup LoadgenAPI
/// @{
/// \brief Represents a unique identifier for a sample of an issued query.
/// \details As currently implemented, the id is a pointer to an internal
/// loadgen struct whose value will never be zero/null.
typedef uintptr_t ResponseId;
constexpr ResponseId kResponseIdReserved = 0;
/// \brief An index into the QuerySampleLibrary corresponding to a
/// single sample.
typedef size_t QuerySampleIndex;
/// \brief Represents the smallest unit of input inference can run on.
/// A query consists of one or more samples.
struct QuerySample {
ResponseId id;
QuerySampleIndex index;
};
/// \brief Represents a single response to a QuerySample.
struct QuerySampleResponse {
ResponseId id;
uintptr_t data;
size_t size; ///< Size in bytes.
int64_t n_tokens;
  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size,
                      int64_t n_tokens)
      : id(id), data(data), size(size), n_tokens(n_tokens) {}
  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size)
      : id(id), data(data), size(size), n_tokens(0) {}
  QuerySampleResponse() : id(0), data(0), size(0), n_tokens(0) {}
};
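// Illustrative sketch of constructing a response for a completed sample;
// RunInference and the output buffer are placeholders:
//   std::vector<uint8_t> output = RunInference(sample);
//   QuerySampleResponse response(
//       sample.id, reinterpret_cast<uintptr_t>(output.data()), output.size());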
/// \brief A latency in nanoseconds, as recorded by the loadgen.
typedef int64_t QuerySampleLatency;
/// @}
} // namespace mlperf
#endif // MLPERF_LOADGEN_QUERY_SAMPLE_H_
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Defines the QuerySampleLibrary interface.
#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
#define MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
#include <memory>
#include <string>
#include <vector>
#include "query_sample.h"
namespace mlperf {
/// \addtogroup LoadgenAPI
/// @{
/// \brief The interface a client implements to coordinate with the loadgen
/// which samples should be loaded.
class QuerySampleLibrary {
public:
virtual ~QuerySampleLibrary() {}
/// \brief A human readable name for the model.
virtual const std::string& Name() = 0;
/// \brief Total number of samples in library.
virtual size_t TotalSampleCount() = 0;
/// \brief The number of samples that are guaranteed to fit in RAM.
virtual size_t PerformanceSampleCount() = 0;
/// \brief Loads the requested query samples into memory.
/// \details Paired with calls to UnloadSamplesFromRam.
/// In the MultiStream scenarios:
/// * Samples will appear more than once.
/// * SystemUnderTest::IssueQuery will only be called with a set of samples
/// that are neighbors in the vector of samples here, which helps
/// SUTs that need the queries to be contiguous.
/// In all other scenarios:
/// * A previously loaded sample will not be loaded again.
virtual void LoadSamplesToRam(
const std::vector<QuerySampleIndex>& samples) = 0;
/// \brief Unloads the requested query samples from memory.
/// \details In the MultiStream scenarios:
  ///  * Samples may be unloaded the same number of times they were loaded;
  ///    however, if the implementation de-dups loaded samples rather than
  ///    loading samples into contiguous memory, it may unload a sample the
  ///    first time it sees it unloaded, without a refcounting scheme, and
  ///    ignore subsequent unloads. A refcounting scheme would also work, but
  ///    is not a requirement.
/// In all other scenarios:
/// * A previously unloaded sample will not be unloaded again.
virtual void UnloadSamplesFromRam(
const std::vector<QuerySampleIndex>& samples) = 0;
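  // Illustrative sketch of the optional refcounting scheme mentioned above
  // for de-duplicating MultiStream implementations; names are placeholders:
  //   std::unordered_map<QuerySampleIndex, size_t> refcount_;
  //   void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) {
  //     for (auto s : samples)
  //       if (refcount_[s]++ == 0) LoadIntoMemory(s);
  //   }
  //   void UnloadSamplesFromRam(const std::vector<QuerySampleIndex>& samples) {
  //     for (auto s : samples)
  //       if (--refcount_[s] == 0) EvictFromMemory(s);
  //   }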
};
/// @}
} // namespace mlperf
#endif // MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "results.h"
#include "early_stopping.h"
#include "utils.h"
namespace mlperf {
namespace loadgen {
void PerformanceSummary::ProcessLatencies() {
if (pr.sample_latencies.empty()) {
return;
}
sample_count = pr.sample_latencies.size();
QuerySampleLatency accumulated_sample_latency = 0;
for (auto latency : pr.sample_latencies) {
accumulated_sample_latency += latency;
}
sample_latency_mean = accumulated_sample_latency / sample_count;
std::sort(pr.sample_latencies.begin(), pr.sample_latencies.end());
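  // Percentiles below use nearest-rank indexing into the sorted latencies:
  // index = floor(sample_count * percentile).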
target_latency_percentile.sample_latency =
pr.sample_latencies[sample_count * target_latency_percentile.percentile];
sample_latency_min = pr.sample_latencies.front();
sample_latency_max = pr.sample_latencies.back();
for (auto& lp : latency_percentiles) {
assert(lp.percentile >= 0.0);
assert(lp.percentile < 1.0);
lp.sample_latency = pr.sample_latencies[sample_count * lp.percentile];
}
query_count = pr.queries_issued;
// Count the number of overlatency queries. Only for Server scenario. Since in
// this scenario the number of samples per query is 1, sample_latencies are
// used.
if (settings.scenario == TestScenario::Server) {
QuerySampleLatency max_latency = settings.target_latency.count() + 1;
overlatency_query_count =
pr.sample_latencies.end() -
std::lower_bound(pr.sample_latencies.begin(), pr.sample_latencies.end(),
max_latency);
}
if (settings.use_token_latencies) {
ProcessTokenLatencies();
}
// MultiStream only after this point.
if (settings.scenario != TestScenario::MultiStream) {
return;
}
// Calculate per-query stats.
size_t query_count = pr.queries_issued;
assert(pr.query_latencies.size() == query_count);
std::sort(pr.query_latencies.begin(), pr.query_latencies.end());
QuerySampleLatency accumulated_query_latency = 0;
for (auto latency : pr.query_latencies) {
accumulated_query_latency += latency;
}
query_latency_mean = accumulated_query_latency / query_count;
query_latency_min = pr.query_latencies.front();
query_latency_max = pr.query_latencies.back();
target_latency_percentile.query_latency =
pr.query_latencies[query_count * target_latency_percentile.percentile];
for (auto& lp : latency_percentiles) {
lp.query_latency = pr.query_latencies[query_count * lp.percentile];
}
}
void PerformanceSummary::ProcessTokenLatencies() {
constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
token_count = 0;
for (auto n_tokens : pr.token_results.tokens_per_sample) {
if (n_tokens != nTokenInvalid) token_count += n_tokens;
}
if (pr.token_results.first_token_latencies.empty()) {
return;
}
QuerySampleLatency accumulated_first_token_latency = 0;
for (auto latency : pr.token_results.first_token_latencies) {
accumulated_first_token_latency += latency;
}
first_token_latency_mean = accumulated_first_token_latency / sample_count;
QuerySampleLatency accumulated_tpot = 0;
for (auto latency : pr.token_results.time_per_output_token_arr) {
accumulated_tpot += latency;
}
time_per_output_token_mean = accumulated_tpot / sample_count;
std::sort(pr.token_results.first_token_latencies.begin(),
pr.token_results.first_token_latencies.end());
std::sort(pr.token_results.time_per_output_token_arr.begin(),
pr.token_results.time_per_output_token_arr.end());
token_target_latency_percentile.sample_latency =
pr.token_results
.first_token_latencies[sample_count *
token_target_latency_percentile.percentile];
first_token_latency_min = pr.token_results.first_token_latencies.front();
first_token_latency_max = pr.token_results.first_token_latencies.back();
for (auto& lp : token_latency_percentiles) {
assert(lp.percentile >= 0.0);
assert(lp.percentile < 1.0);
lp.sample_latency =
pr.token_results.first_token_latencies[sample_count * lp.percentile];
}
target_tpot_percentile.sample_latency =
pr.token_results
.time_per_output_token_arr[sample_count *
target_tpot_percentile.percentile];
time_per_output_token_min =
pr.token_results.time_per_output_token_arr.front();
time_per_output_token_max = pr.token_results.time_per_output_token_arr.back();
for (auto& lp : tpot_percentiles) {
assert(lp.percentile >= 0.0);
assert(lp.percentile < 1.0);
lp.sample_latency =
pr.token_results
.time_per_output_token_arr[sample_count * lp.percentile];
}
if (settings.scenario == TestScenario::Server) {
// TODO: Maybe another target latency needs to be added?
QuerySampleLatency max_latency = settings.target_latency.count() + 1;
overlatency_first_token_count =
pr.token_results.first_token_latencies.end() -
std::lower_bound(pr.token_results.first_token_latencies.begin(),
pr.token_results.first_token_latencies.end(),
max_latency);
}
}
bool PerformanceSummary::EarlyStopping(
std::string* recommendation, int64_t queries_issued,
std::vector<QuerySampleLatency>* sample_latencies,
std::vector<QuerySampleLatency>* query_latencies,
std::chrono::nanoseconds target_latency) {
recommendation->clear();
MinPassingQueriesFinder find_min_passing;
double confidence = 0.99;
double tolerance = 0.0;
ProcessLatencies();
switch (settings.scenario) {
case TestScenario::SingleStream: {
// TODO: Grab multistream percentile from settings, instead of hardcoding.
double multi_stream_percentile = 0.99;
int64_t t = 1;
int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
tolerance, confidence);
int64_t h = h_min;
if (queries_issued < h_min + 1) {
*recommendation =
" * Only processed " + std::to_string(queries_issued) +
" queries.\n * Need to process at least " +
std::to_string(h_min + 1) + " queries for early stopping.";
return false;
} else {
for (int64_t i = 2; i < queries_issued + 1; ++i) {
h = find_min_passing(i, target_latency_percentile.percentile,
tolerance, confidence);
if (queries_issued < h + i) {
t = i - 1;
break;
}
}
}
QuerySampleLatency percentile_estimate =
(*sample_latencies)[queries_issued - t];
*recommendation =
" * Processed at least " + std::to_string(h_min + 1) + " queries (" +
std::to_string(queries_issued) + ").\n" + " * Would discard " +
std::to_string(t - 1) + " highest latency queries.\n" +
" * Early stopping " +
DoubleToString(target_latency_percentile.percentile * 100, 0) +
"th percentile estimate: " + std::to_string(percentile_estimate);
early_stopping_latency_ss = percentile_estimate;
      // Early stopping estimate for 99%ile (used for inferring multi-stream
      // from single-stream)
t = 1;
h_min =
find_min_passing(1, multi_stream_percentile, tolerance, confidence);
h = h_min;
if (queries_issued < h_min + 1) {
*recommendation +=
"\n * Not enough queries processed for " +
DoubleToString(multi_stream_percentile * 100, 0) +
"th percentile\n" +
" early stopping estimate (would need to process at\n least " +
std::to_string(h_min + 1) + " total queries).";
} else {
for (int64_t i = 2; i < queries_issued + 1; ++i) {
h = find_min_passing(i, multi_stream_percentile, tolerance,
confidence);
if (queries_issued < h + i) {
t = i - 1;
break;
}
}
percentile_estimate = (*sample_latencies)[queries_issued - t];
*recommendation +=
"\n * Early stopping " +
DoubleToString(multi_stream_percentile * 100, 0) +
"th percentile estimate: " + std::to_string(percentile_estimate);
early_stopping_latency_ms = percentile_estimate;
}
break;
}
case TestScenario::Server: {
int64_t t =
std::count_if((*sample_latencies).begin(), (*sample_latencies).end(),
[=](auto const& latency) {
return latency > target_latency.count();
});
int64_t h = find_min_passing(t, target_latency_percentile.percentile,
tolerance, confidence);
if (queries_issued >= h + t) {
*recommendation = " * Run successful.";
} else {
*recommendation = " * Run unsuccessful.\n * Processed " +
std::to_string(queries_issued) + " queries.\n" +
" * Would need to run at least " +
std::to_string(h + t - queries_issued) +
" more queries,\n with the run being successful if "
"every additional\n query were under latency.";
return false;
}
break;
}
case TestScenario::MultiStream: {
int64_t t = 1;
int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
tolerance, confidence);
int64_t h = h_min;
if (queries_issued < h_min + 1) {
*recommendation =
" * Only processed " + std::to_string(queries_issued) +
" queries.\n * Need to process at least " +
std::to_string(h_min + 1) + " queries for early stopping.";
return false;
} else {
for (int64_t i = 2; i < queries_issued + 1; ++i) {
h = find_min_passing(i, target_latency_percentile.percentile,
tolerance, confidence);
if (queries_issued < h + i) {
t = i - 1;
break;
}
}
}
QuerySampleLatency percentile_estimate =
(*query_latencies)[queries_issued - t];
*recommendation =
" * Processed at least " + std::to_string(h_min + 1) + " queries (" +
std::to_string(queries_issued) + ").\n" + " * Would discard " +
std::to_string(t - 1) + " highest latency queries.\n" +
" * Early stopping " +
DoubleToString(target_latency_percentile.percentile * 100, 0) +
"th percentile estimate: " + std::to_string(percentile_estimate);
early_stopping_latency_ms = percentile_estimate;
break;
}
case TestScenario::Offline:
break;
}
return true;
}
bool PerformanceSummary::MinDurationMet(std::string* recommendation) {
recommendation->clear();
const double min_duration = DurationToSeconds(settings.min_duration);
bool min_duration_met = false;
switch (settings.scenario) {
case TestScenario::Offline:
min_duration_met = pr.max_latency >= min_duration;
break;
case TestScenario::Server:
min_duration_met = pr.final_query_scheduled_time >= min_duration;
break;
case TestScenario::SingleStream:
case TestScenario::MultiStream:
min_duration_met = pr.final_query_issued_time >= min_duration;
break;
}
if (min_duration_met) {
return true;
}
switch (settings.scenario) {
case TestScenario::SingleStream:
case TestScenario::MultiStream:
*recommendation =
"Decrease the expected latency so the loadgen pre-generates more "
"queries.";
break;
case TestScenario::Server:
*recommendation =
"Increase the target QPS so the loadgen pre-generates more queries.";
break;
case TestScenario::Offline:
*recommendation =
"Increase expected QPS so the loadgen pre-generates a larger "
"(coalesced) query.";
break;
}
return false;
}
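
// Simple accessors: threshold checks against the settings-derived minimums
// (the summary treats the two as one "min queries satisfied" condition), and
// whether the scenario carries a hard performance constraint (Server only).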
bool PerformanceSummary::MinQueriesMet() {
  return pr.queries_issued >= settings.min_query_count;
}

bool PerformanceSummary::MinSamplesMet() {
  return sample_count >= settings.min_sample_count;
}

bool PerformanceSummary::HasPerfConstraints() {
  return settings.scenario == TestScenario::Server;
}
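
// Server is the only scenario with a hard performance constraint. Without
// token latencies, the target-percentile sample latency is checked against
// the target latency; with token latencies, the TTFT and TPOT percentile
// estimates are checked against their respective server limits.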
bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) {
recommendation->clear();
bool perf_constraints_met = true;
switch (settings.scenario) {
case TestScenario::SingleStream:
case TestScenario::MultiStream:
break;
case TestScenario::Server:
ProcessLatencies();
if (!settings.use_token_latencies) {
if (target_latency_percentile.sample_latency >
settings.target_latency.count()) {
*recommendation = "Reduce target QPS to improve latency.";
perf_constraints_met = false;
}
} else {
if (token_target_latency_percentile.sample_latency >
settings.server_ttft_latency) {
*recommendation =
"TTFT constrain not met: Reduce target QPS to improve latency.";
perf_constraints_met = false;
}
if (target_tpot_percentile.sample_latency >
settings.server_tpot_latency) {
if (recommendation->empty()) {
*recommendation =
"TPOT constrain not met: Reduce target QPS to improve latency.";
} else {
recommendation->append(
"\n * TPOT constrain not met: Reduce target QPS to improve "
"latency.");
}
perf_constraints_met = false;
}
}
break;
case TestScenario::Offline:
break;
}
return perf_constraints_met;
}
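
// Emits the human-readable results summary: the per-scenario headline metric,
// the validity verdict with recommendations, early stopping results, and
// additional latency statistics.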
void PerformanceSummary::LogSummary(AsyncSummary& summary) {
ProcessLatencies();
summary(
"================================================\n"
"MLPerf Results Summary\n"
"================================================");
summary("SUT name : ", sut_name);
summary("Scenario : ", ToString(settings.scenario));
summary("Mode : ", ToString(settings.mode));
switch (settings.scenario) {
case TestScenario::SingleStream: {
summary(DoubleToString(target_latency_percentile.percentile * 100, 0) +
"th percentile latency (ns) : ",
target_latency_percentile.sample_latency);
break;
}
case TestScenario::MultiStream: {
summary(DoubleToString(target_latency_percentile.percentile * 100, 0) +
"th percentile latency (ns) : ",
target_latency_percentile.query_latency);
break;
}
case TestScenario::Server: {
// Subtract 1 from sample count since the start of the final sample
// represents the open end of the time range: i.e. [begin, end).
// This makes sense since:
// a) QPS doesn't apply if there's only one sample; it's pure latency.
      // b) If you have precisely 1k QPS, there will be a sample exactly on
      //    the 1 second time point; but that would be the 1001st sample in
      //    the stream. Given the first 1001 queries, the QPS is
      //    1000 queries / 1 second.
// TODO: make a more permanent solution
double qps_as_completed =
(sample_count - 1) / pr.final_query_all_samples_done_time;
summary("Completed samples per second : ",
DoubleToString(qps_as_completed));
break;
}
case TestScenario::Offline: {
double samples_per_second = sample_count / pr.max_latency;
summary("Samples per second: ", samples_per_second);
break;
}
}
if (settings.use_token_latencies) {
switch (settings.scenario) {
case TestScenario::SingleStream: {
summary(DoubleToString(token_target_latency_percentile.percentile * 100,
0) +
"th first token percentile latency (ns) : ",
token_target_latency_percentile.sample_latency);
break;
}
case TestScenario::MultiStream: {
summary(DoubleToString(token_target_latency_percentile.percentile * 100,
0) +
"th first token percentile latency (ns) : ",
token_target_latency_percentile.sample_latency);
break;
}
case TestScenario::Offline: {
double tokens_per_second = token_count / pr.max_latency;
summary("Tokens per second: ", tokens_per_second);
break;
}
      case TestScenario::Server: {
        double tps_as_completed =
            token_count / pr.final_query_all_samples_done_time;
        summary("Completed tokens per second: ",
                DoubleToString(tps_as_completed));
        break;
      }
}
}
if (settings.infer_token_latencies) {
switch (settings.scenario) {
case TestScenario::SingleStream: {
break;
}
case TestScenario::MultiStream: {
break;
}
case TestScenario::Offline: {
double tokens_per_second = settings.token_latency_scaling_factor *
sample_count / pr.max_latency;
summary("Tokens per second (inferred): ", tokens_per_second);
break;
}
      case TestScenario::Server: {
        double tps_as_completed = settings.token_latency_scaling_factor *
                                  (sample_count - 1) /
                                  pr.final_query_all_samples_done_time;
        summary("Completed tokens per second (inferred): ",
                DoubleToString(tps_as_completed));
        break;
      }
}
}
std::string min_duration_recommendation;
std::string perf_constraints_recommendation;
std::string early_stopping_recommendation;
std::string early_stopping_ttft_recommendation;
std::string early_stopping_tpot_recommendation;
bool min_duration_met = MinDurationMet(&min_duration_recommendation);
bool min_queries_met = MinQueriesMet() && MinSamplesMet();
bool early_stopping_met = true;
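  // With token latencies enabled, early stopping is evaluated separately on
  // the per-output-token and first-token latency arrays against the server
  // TPOT and TTFT limits; both checks must pass.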
if (!settings.use_token_latencies) {
early_stopping_met = EarlyStopping(
&early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
&pr.query_latencies, settings.target_latency);
} else {
early_stopping_met =
EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
&pr.token_results.time_per_output_token_arr,
&pr.query_latencies,
std::chrono::nanoseconds(settings.server_tpot_latency)) &&
EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
&pr.token_results.first_token_latencies,
&pr.query_latencies,
std::chrono::nanoseconds(settings.server_ttft_latency));
}
bool perf_constraints_met =
PerfConstraintsMet(&perf_constraints_recommendation);
bool all_constraints_met = min_duration_met && min_queries_met &&
perf_constraints_met && early_stopping_met;
summary("Result is : ", all_constraints_met ? "VALID" : "INVALID");
if (HasPerfConstraints()) {
summary(" Performance constraints satisfied : ",
perf_constraints_met ? "Yes" : "NO");
}
summary(" Min duration satisfied : ", min_duration_met ? "Yes" : "NO");
summary(" Min queries satisfied : ", min_queries_met ? "Yes" : "NO");
summary(" Early stopping satisfied: ", early_stopping_met ? "Yes" : "NO");
if (!all_constraints_met) {
summary("Recommendations:");
if (!perf_constraints_met) {
summary(" * " + perf_constraints_recommendation);
}
if (!min_duration_met) {
summary(" * " + min_duration_recommendation);
}
if (!min_queries_met) {
summary(
" * The test exited early, before enough queries were issued.\n"
" See the detailed log for why this may have occurred.");
}
}
// Early stopping results
if (settings.scenario == TestScenario::SingleStream ||
settings.scenario == TestScenario::Server ||
settings.scenario == TestScenario::MultiStream) {
if (!settings.use_token_latencies) {
summary("Early Stopping Result:");
summary(early_stopping_recommendation);
} else {
summary("TTFT Early Stopping Result:");
summary(early_stopping_ttft_recommendation);
summary("TPOT Early Stopping Result:");
summary(early_stopping_tpot_recommendation);
}
}
summary(
"\n"
"================================================\n"
"Additional Stats\n"
"================================================");
if (settings.scenario == TestScenario::SingleStream) {
double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
summary("QPS w/ loadgen overhead : " + DoubleToString(qps_w_lg));
summary("QPS w/o loadgen overhead : " + DoubleToString(qps_wo_lg));
summary("");
} else if (settings.scenario == TestScenario::Server) {
// Scheduled samples per second as an additional stat
double qps_as_scheduled =
(sample_count - 1) / pr.final_query_scheduled_time;
summary("Scheduled samples per second : ",
DoubleToString(qps_as_scheduled));
} else if (settings.scenario == TestScenario::MultiStream) {
summary("Per-query latency: ");
summary("Min latency (ns) : ", query_latency_min);
summary("Max latency (ns) : ", query_latency_max);
summary("Mean latency (ns) : ", query_latency_mean);
for (auto& lp : latency_percentiles) {
summary(
DoubleToString(lp.percentile * 100) + " percentile latency (ns) : ",
lp.query_latency);
}
}
if (settings.scenario != TestScenario::MultiStream) {
summary("Min latency (ns) : ", sample_latency_min);
summary("Max latency (ns) : ", sample_latency_max);
summary("Mean latency (ns) : ", sample_latency_mean);
for (auto& lp : latency_percentiles) {
summary(
DoubleToString(lp.percentile * 100) + " percentile latency (ns) : ",
lp.sample_latency);
}
}
if (settings.use_token_latencies) {
summary("");
if (settings.scenario == TestScenario::SingleStream) {
double tps_w_lg = token_count / pr.final_query_issued_time;
      double tps_wo_lg =
          static_cast<double>(token_count) /
          (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count);
summary("TPS w/ loadgen overhead : " + DoubleToString(tps_w_lg));
summary("TPS w/o loadgen overhead : " + DoubleToString(tps_wo_lg));
} else if (settings.scenario == TestScenario::Server) {
double tps_as_completed =
token_count / pr.final_query_all_samples_done_time;
summary("Completed tokens per second : ",
DoubleToString(tps_as_completed));
}
if (settings.scenario != TestScenario::Offline) {
summary("Min First Token latency (ns) : ",
first_token_latency_min);
summary("Max First Token latency (ns) : ",
first_token_latency_max);
summary("Mean First Token latency (ns) : ",
first_token_latency_mean);
for (auto& lp : token_latency_percentiles) {
summary(DoubleToString(lp.percentile * 100) +
" percentile first token latency (ns) : ",
lp.sample_latency);
}
summary("");
summary("Min Time to Output Token (ns) : ",
time_per_output_token_min);
summary("Max Time to Output Token (ns) : ",
time_per_output_token_max);
summary("Mean Time to Output Token (ns) : ",
time_per_output_token_mean);
for (auto& lp : tpot_percentiles) {
summary(DoubleToString(lp.percentile * 100) +
" percentile time to output token (ns) : ",
lp.sample_latency);
}
}
}
summary(
"\n"
"================================================\n"
"Test Parameters Used\n"
"================================================");
settings.LogSummary(summary);
}
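
// Structured counterpart of LogSummary: performs the same validity checks and
// emits the per-scenario metrics as MLPERF_LOG key/value entries.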
void PerformanceSummary::LogDetail(AsyncDetail& detail) {
#if USE_NEW_LOGGING_FORMAT
ProcessLatencies();
// General validity checking
std::string min_duration_recommendation;
std::string perf_constraints_recommendation;
std::string early_stopping_recommendation;
std::string early_stopping_ttft_recommendation;
std::string early_stopping_tpot_recommendation;
bool min_duration_met = MinDurationMet(&min_duration_recommendation);
bool min_queries_met = MinQueriesMet() && MinSamplesMet();
bool perf_constraints_met =
PerfConstraintsMet(&perf_constraints_recommendation);
bool early_stopping_met = true;
if (!settings.use_token_latencies) {
early_stopping_met = EarlyStopping(
&early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
&pr.query_latencies, settings.target_latency);
} else {
early_stopping_met =
EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
&pr.token_results.time_per_output_token_arr,
&pr.query_latencies,
std::chrono::nanoseconds(settings.server_tpot_latency)) &&
EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
&pr.token_results.first_token_latencies,
&pr.query_latencies,
std::chrono::nanoseconds(settings.server_ttft_latency));
}
bool all_constraints_met = min_duration_met && min_queries_met &&
perf_constraints_met && early_stopping_met;
MLPERF_LOG(detail, "result_validity",
all_constraints_met ? "VALID" : "INVALID");
if (HasPerfConstraints()) {
MLPERF_LOG(detail, "result_perf_constraints_met", perf_constraints_met);
}
MLPERF_LOG(detail, "result_min_duration_met", min_duration_met);
MLPERF_LOG(detail, "result_min_queries_met", min_queries_met);
MLPERF_LOG(detail, "early_stopping_met", early_stopping_met);
if (!all_constraints_met) {
std::string recommendation;
if (!perf_constraints_met) {
recommendation += perf_constraints_recommendation + " ";
}
if (!min_duration_met) {
recommendation += min_duration_recommendation + " ";
}
if (!min_queries_met) {
recommendation +=
"The test exited early, before enough queries were issued.";
}
std::replace(recommendation.begin(), recommendation.end(), '\n', ' ');
MLPERF_LOG(detail, "result_invalid_reason", recommendation);
}
std::replace(early_stopping_recommendation.begin(),
early_stopping_recommendation.end(), '\n', ' ');
if (!settings.use_token_latencies) {
MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation);
} else {
std::replace(early_stopping_ttft_recommendation.begin(),
early_stopping_ttft_recommendation.end(), '\n', ' ');
std::replace(early_stopping_tpot_recommendation.begin(),
early_stopping_tpot_recommendation.end(), '\n', ' ');
MLPERF_LOG(detail, "early_stopping_ttft_result",
early_stopping_ttft_recommendation);
MLPERF_LOG(detail, "early_stopping_tpot_result",
early_stopping_tpot_recommendation);
}
// Report number of queries
MLPERF_LOG(detail, "result_query_count", query_count);
if (settings.scenario == TestScenario::Server) {
MLPERF_LOG(detail, "result_overlatency_query_count",
overlatency_query_count);
}
auto reportPerQueryLatencies = [&]() {
MLPERF_LOG(detail, "result_min_query_latency_ns", query_latency_min);
MLPERF_LOG(detail, "result_max_query_latency_ns", query_latency_max);
MLPERF_LOG(detail, "result_mean_query_latency_ns", query_latency_mean);
for (auto& lp : latency_percentiles) {
std::string percentile = DoubleToString(lp.percentile * 100);
MLPERF_LOG(detail,
"result_" + percentile + "_percentile_per_query_latency_ns",
lp.query_latency);
}
};
// Per-scenario performance results.
switch (settings.scenario) {
case TestScenario::SingleStream: {
double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
MLPERF_LOG(detail, "result_qps_with_loadgen_overhead", qps_w_lg);
MLPERF_LOG(detail, "result_qps_without_loadgen_overhead", qps_wo_lg);
MLPERF_LOG(detail, "early_stopping_latency_ss",
early_stopping_latency_ss);
MLPERF_LOG(detail, "early_stopping_latency_ms",
early_stopping_latency_ms);
break;
}
case TestScenario::MultiStream: {
reportPerQueryLatencies();
MLPERF_LOG(detail, "early_stopping_latency_ms",
early_stopping_latency_ms);
break;
}
case TestScenario::Server: {
// Subtract 1 from sample count since the start of the final sample
// represents the open end of the time range: i.e. [begin, end).
// This makes sense since:
// a) QPS doesn't apply if there's only one sample; it's pure latency.
      // b) If you have precisely 1k QPS, there will be a sample exactly on
      //    the 1 second time point; but that would be the 1001st sample in
      //    the stream. Given the first 1001 queries, the QPS is
      //    1000 queries / 1 second.
double qps_as_scheduled =
(sample_count - 1) / pr.final_query_scheduled_time;
MLPERF_LOG(detail, "result_scheduled_samples_per_sec", qps_as_scheduled);
double qps_as_completed =
(sample_count - 1) / pr.final_query_all_samples_done_time;
MLPERF_LOG(detail, "result_completed_samples_per_sec", qps_as_completed);
break;
}
case TestScenario::Offline: {
double samples_per_second = sample_count / pr.max_latency;
MLPERF_LOG(detail, "result_samples_per_second", samples_per_second);
break;
}
}
// Detailed latencies
MLPERF_LOG(detail, "result_min_latency_ns", sample_latency_min);
MLPERF_LOG(detail, "result_max_latency_ns", sample_latency_max);
MLPERF_LOG(detail, "result_mean_latency_ns", sample_latency_mean);
for (auto& lp : latency_percentiles) {
MLPERF_LOG(detail,
"result_" + DoubleToString(lp.percentile * 100) +
"_percentile_latency_ns",
lp.sample_latency);
}
// Detailed first token latencies
if (settings.use_token_latencies) {
if (settings.scenario != TestScenario::Offline) {
MLPERF_LOG(detail, "result_first_token_min_latency_ns",
first_token_latency_min);
MLPERF_LOG(detail, "result_first_token_max_latency_ns",
first_token_latency_max);
MLPERF_LOG(detail, "result_first_token_mean_latency_ns",
first_token_latency_mean);
for (auto& lp : token_latency_percentiles) {
MLPERF_LOG(detail,
"result_first_token_" + DoubleToString(lp.percentile * 100) +
"_percentile_latency_ns",
lp.sample_latency);
}
      double tps_w_lg =
          static_cast<double>(token_count) / pr.final_query_issued_time;
      // Convert the mean sample latency to seconds so this matches the
      // tokens-per-second computation in LogSummary.
      double tps_wo_lg =
          static_cast<double>(token_count) /
          (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count);
MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead",
tps_w_lg);
MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
for (auto& lp : tpot_percentiles) {
MLPERF_LOG(detail,
"result_time_per_output_token_" +
DoubleToString(lp.percentile * 100) + "_percentile_ns",
lp.sample_latency);
}
MLPERF_LOG(detail, "result_time_to_output_token_min",
time_per_output_token_min);
MLPERF_LOG(detail, "result_time_to_output_token_max",
time_per_output_token_max);
MLPERF_LOG(detail, "result_time_to_output_token_mean",
time_per_output_token_mean);
double tps_as_completed =
token_count / pr.final_query_all_samples_done_time;
MLPERF_LOG(detail, "result_completed_tokens_per_second",
tps_as_completed);
} else {
double tokens_per_second = token_count / pr.max_latency;
MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
}
}
if (settings.infer_token_latencies) {
switch (settings.scenario) {
case TestScenario::Server: {
double completed_tokens_per_second =
(sample_count - 1) * settings.token_latency_scaling_factor /
pr.final_query_all_samples_done_time;
MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second",
completed_tokens_per_second);
break;
}
case TestScenario::Offline: {
double tokens_per_second = sample_count *
settings.token_latency_scaling_factor /
pr.max_latency;
MLPERF_LOG(detail, "result_inferred_tokens_per_second",
tokens_per_second);
break;
}
case TestScenario::SingleStream: {
break;
}
case TestScenario::MultiStream: {
break;
}
}
  }
#endif
}
} // namespace loadgen
} // namespace mlperf

/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/// \file
/// \brief Defines PerformanceResult and PerformanceSummary.
#ifndef MLPERF_LOADGEN_RESULTS_H_
#define MLPERF_LOADGEN_RESULTS_H_

#include <string>
#include <vector>

#include "query_sample.h"
#include "test_settings_internal.h"

namespace mlperf {
namespace loadgen {

/// \brief Contains the performance results for benchmarks that have
/// token based metrics
struct TokenPerformanceResults {
std::vector<QuerySampleLatency> first_token_latencies;
std::vector<QuerySampleLatency> time_per_output_token_arr;
std::vector<int64_t> tokens_per_sample;
};

/// \brief Provides performance results that are independent of scenario
/// and other context.
struct PerformanceResult {
std::vector<QuerySampleLatency> sample_latencies;
std::vector<QuerySampleLatency> query_latencies;
size_t queries_issued;
double max_latency;
double final_query_scheduled_time; // seconds from start.
double final_query_issued_time; // seconds from start.
double final_query_all_samples_done_time; // seconds from start.
TokenPerformanceResults token_results;
};

/// \brief Wraps PerformanceResult with relevant context to change how
/// it's interpreted and reported.
struct PerformanceSummary {
std::string sut_name;
TestSettingsInternal settings;
PerformanceResult pr;
// Set by ProcessLatencies.
size_t sample_count = 0;
size_t query_count = 0;
size_t overlatency_query_count = 0;
QuerySampleLatency sample_latency_min = 0;
QuerySampleLatency sample_latency_max = 0;
QuerySampleLatency sample_latency_mean = 0;
QuerySampleLatency query_latency_min = 0;
QuerySampleLatency query_latency_max = 0;
QuerySampleLatency query_latency_mean = 0;
/// \brief The latency at a given percentile.
struct PercentileEntry {
const double percentile;
QuerySampleLatency sample_latency = 0;
QuerySampleLatency query_latency = 0; // MultiStream only.
};
// Latency target percentile
PercentileEntry target_latency_percentile{settings.target_latency_percentile};
PercentileEntry latency_percentiles[6] = {{.50}, {.90}, {.95},
{.97}, {.99}, {.999}};
// Early stopping percentile estimates for SingleStream and MultiStream
QuerySampleLatency early_stopping_latency_ss = 0;
QuerySampleLatency early_stopping_latency_ms = 0;
// Set by ProcessTokenLatencies
size_t token_count = 0;
size_t overlatency_first_token_count = 0;
QuerySampleLatency first_token_latency_min = 0;
QuerySampleLatency first_token_latency_max = 0;
QuerySampleLatency first_token_latency_mean = 0;
QuerySampleLatency time_per_output_token_min = 0;
QuerySampleLatency time_per_output_token_max = 0;
QuerySampleLatency time_per_output_token_mean = 0;
  // Token latency target percentile
PercentileEntry token_target_latency_percentile{
settings.target_latency_percentile};
PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95},
{.97}, {.99}, {.999}};
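  // TPOT (time per output token) target percentile and distribution.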
PercentileEntry target_tpot_percentile{settings.target_latency_percentile};
PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95},
{.97}, {.99}, {.999}};
#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
// MSVC complains if there is no explicit constructor.
// (target_latency_percentile above depends on construction with settings)
PerformanceSummary(const std::string& sut_name_arg,
const TestSettingsInternal& settings_arg,
const PerformanceResult& pr_arg)
      : sut_name(sut_name_arg), settings(settings_arg), pr(pr_arg) {}
#endif
void ProcessLatencies();
void ProcessTokenLatencies();
bool MinDurationMet(std::string* recommendation);
bool EarlyStopping(std::string* recommendation, int64_t queries_issued,
std::vector<QuerySampleLatency>* sample_latencies,
std::vector<QuerySampleLatency>* query_latencies,
std::chrono::nanoseconds target_latency);
bool MinQueriesMet();
bool MinSamplesMet();
bool HasPerfConstraints();
bool PerfConstraintsMet(std::string* recommendation);
void LogSummary(AsyncSummary& summary);
void LogDetail(AsyncDetail& detail);
};
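
// Typical use (a sketch, not prescribed by this header): construct with the
// SUT name, the internal test settings, and a completed PerformanceResult,
// then call LogSummary() and/or LogDetail() once the run finishes.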
} // namespace loadgen
} // namespace mlperf
#endif  // MLPERF_LOADGEN_RESULTS_H_