Unverified Commit 3654ecaa authored by AndreyOrb's avatar AndreyOrb Committed by GitHub
Browse files

[c++] Fixed Predictor lifecycle and trees initialization in Contrib mode (#6778)



* 1) Fixed Predictor lifecycle
2) Fixed Boosting trees initialization

#5482

* Added tests for LGBM_BoosterPredictForMat in Contrib mode

* #6778 Reverted indentation to 4 spaces

---------
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 226e7f7d
...@@ -433,11 +433,18 @@ class GBDT : public GBDTBase { ...@@ -433,11 +433,18 @@ class GBDT : public GBDTBase {
num_iteration_for_pred_ = num_iteration_for_pred_ - start_iteration; num_iteration_for_pred_ = num_iteration_for_pred_ - start_iteration;
} }
start_iteration_for_pred_ = start_iteration; start_iteration_for_pred_ = start_iteration;
if (is_pred_contrib) {
if (is_pred_contrib && !models_initialized_) {
std::lock_guard<std::mutex> lock(instance_mutex_);
if (models_initialized_)
return;
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(models_.size()); ++i) { for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
models_[i]->RecomputeMaxDepth(); models_[i]->RecomputeMaxDepth();
} }
models_initialized_ = true;
} }
} }
...@@ -548,6 +555,10 @@ class GBDT : public GBDTBase { ...@@ -548,6 +555,10 @@ class GBDT : public GBDTBase {
int max_feature_idx_; int max_feature_idx_;
/*! \brief Parser config file content */ /*! \brief Parser config file content */
std::string parser_config_str_ = ""; std::string parser_config_str_ = "";
/*! \brief Are the models initialized (passed RecomputeMaxDepth phase) */
bool models_initialized_ = false;
/*! \brief Mutex for exclusive models initialization */
std::mutex instance_mutex_;
#ifdef USE_CUDA #ifdef USE_CUDA
/*! \brief First order derivative of training data */ /*! \brief First order derivative of training data */
......
...@@ -460,7 +460,7 @@ class Booster { ...@@ -460,7 +460,7 @@ class Booster {
*out_len = single_row_predictor->num_pred_in_one_row; *out_len = single_row_predictor->num_pred_in_one_row;
} }
Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const { std::shared_ptr<Predictor> CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const {
if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) { if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) {
Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \ Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \
"You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1); "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1);
...@@ -478,7 +478,7 @@ class Booster { ...@@ -478,7 +478,7 @@ class Booster {
is_raw_score = false; is_raw_score = false;
} }
return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, return std::make_shared<Predictor>(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
} }
...@@ -496,7 +496,7 @@ class Booster { ...@@ -496,7 +496,7 @@ class Booster {
predict_contrib = true; predict_contrib = true;
} }
int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib); int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib);
auto pred_fun = predictor.GetPredictFunction(); auto pred_fun = predictor->GetPredictFunction();
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
...@@ -517,7 +517,7 @@ class Booster { ...@@ -517,7 +517,7 @@ class Booster {
int32_t** out_indices, void** out_data, int data_type, int32_t** out_indices, void** out_data, int data_type,
bool* is_data_float32_ptr, int num_matrices) const { bool* is_data_float32_ptr, int num_matrices) const {
auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
auto pred_sparse_fun = predictor.GetPredictSparseFunction(); auto pred_sparse_fun = predictor->GetPredictSparseFunction();
std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr; std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
...@@ -652,7 +652,7 @@ class Booster { ...@@ -652,7 +652,7 @@ class Booster {
// Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices) // Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices)
int num_matrices = boosting_->NumModelPerIteration(); int num_matrices = boosting_->NumModelPerIteration();
auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
auto pred_sparse_fun = predictor.GetPredictSparseFunction(); auto pred_sparse_fun = predictor->GetPredictSparseFunction();
bool is_col_ptr_int32 = false; bool is_col_ptr_int32 = false;
bool is_data_float32 = false; bool is_data_float32 = false;
int num_output_cols = ncol + 1; int num_output_cols = ncol + 1;
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
using LightGBM::TestUtils; using LightGBM::TestUtils;
TEST(SingleRow, JustWorks) { void test_predict_type(int predict_type, int num_predicts) {
// Load some test data // Load some test data
int result; int result;
...@@ -37,17 +37,19 @@ TEST(SingleRow, JustWorks) { ...@@ -37,17 +37,19 @@ TEST(SingleRow, JustWorks) {
booster_handle, booster_handle,
&n_features); &n_features);
EXPECT_EQ(0, result) << "LGBM_BoosterGetNumFeature result code: " << result; EXPECT_EQ(0, result) << "LGBM_BoosterGetNumFeature result code: " << result;
EXPECT_EQ(28, n_features) << "LGBM_BoosterGetNumFeature number of features: " << n_features;
// Run a single row prediction and compare with regular Mat prediction: // Run a single row prediction and compare with regular Mat prediction:
int64_t output_size; int64_t output_size;
result = LGBM_BoosterCalcNumPredict( result = LGBM_BoosterCalcNumPredict(
booster_handle, booster_handle,
1, 1,
C_API_PREDICT_NORMAL, // predict_type predict_type, // predict_type
0, // start_iteration 0, // start_iteration
-1, // num_iteration -1, // num_iteration
&output_size); &output_size);
EXPECT_EQ(0, result) << "LGBM_BoosterCalcNumPredict result code: " << result; EXPECT_EQ(0, result) << "LGBM_BoosterCalcNumPredict result code: " << result;
EXPECT_EQ(num_predicts, output_size) << "LGBM_BoosterCalcNumPredict output size: " << output_size;
std::ifstream test_file("examples/binary_classification/binary.test"); std::ifstream test_file("examples/binary_classification/binary.test");
std::vector<double> test; std::vector<double> test;
...@@ -77,7 +79,7 @@ TEST(SingleRow, JustWorks) { ...@@ -77,7 +79,7 @@ TEST(SingleRow, JustWorks) {
test_set_size, // nrow test_set_size, // nrow
n_features, // ncol n_features, // ncol
1, // is_row_major 1, // is_row_major
C_API_PREDICT_NORMAL, // predict_type predict_type, // predict_type
0, // start_iteration 0, // start_iteration
-1, // num_iteration -1, // num_iteration
"", "",
...@@ -85,13 +87,47 @@ TEST(SingleRow, JustWorks) { ...@@ -85,13 +87,47 @@ TEST(SingleRow, JustWorks) {
&mat_output[0]); &mat_output[0]);
EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result; EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
// Now let's run with the single row fast prediction API: // Test LGBM_BoosterPredictForMat in multi-threaded mode
const int kNThreads = 10; const int kNThreads = 10;
const int numIterations = 5;
std::vector<std::thread> predict_for_mat_threads(kNThreads);
for (int i = 0; i < kNThreads; i++) {
predict_for_mat_threads[i] = std::thread(
[
i, test_set_size, output_size, n_features,
test = &test[0], booster_handle, predict_type, numIterations
]() {
for (int j = 0; j < numIterations; j++) {
int result;
std::vector<double> mat_output(output_size * test_set_size, -1);
int64_t written;
result = LGBM_BoosterPredictForMat(
booster_handle,
&test[0],
C_API_DTYPE_FLOAT64,
test_set_size, // nrow
n_features, // ncol
1, // is_row_major
predict_type, // predict_type
0, // start_iteration
-1, // num_iteration
"",
&written,
&mat_output[0]);
EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
}
});
}
for (std::thread& t : predict_for_mat_threads) {
t.join();
}
// Now let's run with the single row fast prediction API:
FastConfigHandle fast_configs[kNThreads]; FastConfigHandle fast_configs[kNThreads];
for (int i = 0; i < kNThreads; i++) { for (int i = 0; i < kNThreads; i++) {
result = LGBM_BoosterPredictForMatSingleRowFastInit( result = LGBM_BoosterPredictForMatSingleRowFastInit(
booster_handle, booster_handle,
C_API_PREDICT_NORMAL, // predict_type predict_type, // predict_type
0, // start_iteration 0, // start_iteration
-1, // num_iteration -1, // num_iteration
C_API_DTYPE_FLOAT64, C_API_DTYPE_FLOAT64,
...@@ -102,14 +138,14 @@ TEST(SingleRow, JustWorks) { ...@@ -102,14 +138,14 @@ TEST(SingleRow, JustWorks) {
} }
std::vector<double> single_row_output(output_size * test_set_size, -1); std::vector<double> single_row_output(output_size * test_set_size, -1);
std::vector<std::thread> threads(kNThreads); std::vector<std::thread> single_row_threads(kNThreads);
int batch_size = (test_set_size + kNThreads - 1) / kNThreads; // round up int batch_size = (test_set_size + kNThreads - 1) / kNThreads; // round up
for (int i = 0; i < kNThreads; i++) { for (int i = 0; i < kNThreads; i++) {
threads[i] = std::thread( single_row_threads[i] = std::thread(
[ [
i, batch_size, test_set_size, output_size, n_features, i, batch_size, test_set_size, output_size, n_features,
test = &test[0], fast_configs = &fast_configs[0], single_row_output = &single_row_output[0] test = &test[0], fast_configs = &fast_configs[0], single_row_output = &single_row_output[0]
](){ ]() {
int result; int result;
int64_t written; int64_t written;
for (int j = i * batch_size; j < std::min((i + 1) * batch_size, test_set_size); j++) { for (int j = i * batch_size; j < std::min((i + 1) * batch_size, test_set_size); j++) {
...@@ -122,8 +158,8 @@ TEST(SingleRow, JustWorks) { ...@@ -122,8 +158,8 @@ TEST(SingleRow, JustWorks) {
EXPECT_EQ(written, output_size) << "LGBM_BoosterPredictForMatSingleRowFast unexpected written output size"; EXPECT_EQ(written, output_size) << "LGBM_BoosterPredictForMatSingleRowFast unexpected written output size";
} }
}); });
} }
for (std::thread &t : threads) { for (std::thread& t : single_row_threads) {
t.join(); t.join();
} }
...@@ -141,3 +177,11 @@ TEST(SingleRow, JustWorks) { ...@@ -141,3 +177,11 @@ TEST(SingleRow, JustWorks) {
result = LGBM_DatasetFree(train_dataset); result = LGBM_DatasetFree(train_dataset);
EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result; EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
} }
// Regular (non-contrib) prediction path: C_API_PREDICT_NORMAL yields one
// prediction value per row for this binary-classification model.
TEST(SingleRow, Normal) {
  test_predict_type(C_API_PREDICT_NORMAL, 1);
}
// SHAP/contribution prediction path: C_API_PREDICT_CONTRIB yields one value
// per feature plus a bias term per row (28 features + 1 = 29 — see the
// n_features expectation in test_predict_type). Exercises the lazily
// initialized RecomputeMaxDepth path guarded by models_initialized_.
TEST(SingleRow, Contrib) {
  test_predict_type(C_API_PREDICT_CONTRIB, 29);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment