Unverified Commit c0c65f76 authored by Aakarsh Gopi's avatar Aakarsh Gopi Committed by GitHub
Browse files

Updated network retry delay strategy to scale (#3306)



This allows the number of network connection retries to scale with the
number of machines, while retaining the existing behavior
for cases with a smaller num_machines (< 500).

Fixes #3301
Co-authored-by: Aakarsh Gopi <aakarsh@vaticlabs.com>
parent 2e1b39ba
......@@ -47,7 +47,7 @@ typedef void* FastConfigHandle; /*!< \brief Handle of FastConfig. */
LIGHTGBM_C_EXPORT const char* LGBM_GetLastError();
/*!
* \brief Register a callback function for log redirecting.
* \brief Register a callback function for log redirecting.
* \param callback The callback function to register
* \return 0 when succeed, -1 when failure happens
*/
......
......@@ -8,9 +8,10 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/text_reader.h>
#include <string>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
......@@ -186,7 +187,9 @@ void Linkers::Construct() {
listener_->SetTimeout(socket_timeout_);
listener_->Listen(incoming_cnt);
std::thread listen_thread(&Linkers::ListenThread, this, incoming_cnt);
const int connect_fail_retry_cnt = 20;
const int connect_fail_constant_factor = 20;
const int connect_fail_retries_scale_factor = static_cast<int>(num_machines_ / connect_fail_constant_factor);
const int connect_fail_retry_cnt = std::max(connect_fail_constant_factor, connect_fail_retries_scale_factor);
const int connect_fail_retry_first_delay_interval = 200; // 0.2 s
const float connect_fail_retry_delay_factor = 1.3f;
// start connect
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment