Commit 61292080 authored by ashok-ponnuswami-msft's avatar ashok-ponnuswami-msft Committed by Guolin Ke
Browse files

LightGBM would hang on MPI run if only some nodes have a fatal error. (#2600)

* LightGBM would hang on MPI run if only some nodes have a fatal error. The issue is that the destructor of Application calls MPI_Finalize(), which will cause the program to hand and prevent from exiting. So we move the network finalization out of the destructor and call MPI_Finalize() or MPI_Abort() based on whether there was an unhandled exception.

* Minor updates: Remove excess debug logging, whitespaces.

* Add comments for new functions.
parent 51ceef84
...@@ -3,26 +3,39 @@ ...@@ -3,26 +3,39 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information. * Licensed under the MIT License. See LICENSE file in the project root for license information.
*/ */
#include <LightGBM/application.h> #include <LightGBM/application.h>
#include "network/linkers.h"
#include <iostream> #include <iostream>
int main(int argc, char** argv) { int main(int argc, char** argv) {
bool success = false;
try { try {
LightGBM::Application app(argc, argv); LightGBM::Application app(argc, argv);
app.Run(); app.Run();
#ifdef USE_MPI
LightGBM::Linkers::MpiFinalizeIfIsParallel();
#endif
success = true;
} }
catch (const std::exception& ex) { catch (const std::exception& ex) {
std::cerr << "Met Exceptions:" << std::endl; std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex.what() << std::endl; std::cerr << ex.what() << std::endl;
exit(-1);
} }
catch (const std::string& ex) { catch (const std::string& ex) {
std::cerr << "Met Exceptions:" << std::endl; std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex << std::endl; std::cerr << ex << std::endl;
exit(-1);
} }
catch (...) { catch (...) {
std::cerr << "Unknown Exceptions" << std::endl; std::cerr << "Unknown Exceptions" << std::endl;
}
if (!success) {
#ifdef USE_MPI
LightGBM::Linkers::MpiAbortIfIsParallel();
#endif
exit(-1); exit(-1);
} }
} }
...@@ -138,7 +138,24 @@ class Linkers { ...@@ -138,7 +138,24 @@ class Linkers {
#endif // USE_SOCKET #endif // USE_SOCKET
#ifdef USE_MPI
/*!
* \brief Check if MPI has been initialized
*/
static bool IsMpiInitialized();
/*!
* \brief Finalize the MPI session if it was initialized
*/
static void MpiFinalizeIfIsParallel();
/*!
* \brief Abort the MPI session if it was initialized (called in case there was a error that needs abrupt ending)
*/
static void MpiAbortIfIsParallel();
#endif
private: private:
/*! \brief Rank of local machine */ /*! \brief Rank of local machine */
int rank_; int rank_;
......
...@@ -27,11 +27,35 @@ Linkers::Linkers(Config) { ...@@ -27,11 +27,35 @@ Linkers::Linkers(Config) {
} }
Linkers::~Linkers() { Linkers::~Linkers() {
if (is_init_) { // Don't call MPI_Finalize() here: If the destructor was called because only this node had an exception, calling MPI_Finalize() will cause all nodes to hang.
// Instead we will handle finalize/abort for MPI in main().
}
bool Linkers::IsMpiInitialized() {
int is_mpi_init;
MPI_SAFE_CALL(MPI_Initialized(&is_mpi_init));
return is_mpi_init;
}
void Linkers::MpiFinalizeIfIsParallel() {
if (IsMpiInitialized()) {
Log::Debug("Finalizing MPI session.");
MPI_SAFE_CALL(MPI_Finalize()); MPI_SAFE_CALL(MPI_Finalize());
} }
} }
void Linkers::MpiAbortIfIsParallel() {
try {
if (IsMpiInitialized()) {
std::cerr << "Aborting MPI communication." << std::endl << std::flush;
MPI_SAFE_CALL(MPI_Abort(MPI_COMM_WORLD, -1));;
}
}
catch (...) {
std::cerr << "Exception was raised before aborting MPI. Aborting process..." << std::endl << std::flush;
abort();
}
}
} // namespace LightGBM } // namespace LightGBM
#endif // USE_MPI #endif // USE_MPI
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment