fix max_bin of categorical feature in parallel learning.

12ce2566 · Guolin Ke · ebc0de8b · 12ce2566 · 12ce2566 · 12ce2566
Commit 12ce2566 authored Mar 13, 2017 by Guolin Ke
4 changed files
--- a/include/LightGBM/boosting.h
+++ b/include/LightGBM/boosting.h
@@ -99,14 +99,14 @@ public:
  /*!
  * \brief Get prediction result at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
-  * \return out_len lenght of returned score
+  * \return out_len length of returned score
  */
  virtual int64_t GetNumPredictAt(int data_idx) const = 0;
  /*!
  * \brief Get prediction result at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
  * \param result used to store prediction result, should allocate memory before call this function
-  * \param out_len lenght of returned score
+  * \param out_len length of returned score
  */
  virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;

@@ -125,7 +125,7 @@ public:
  virtual std::vector<double> Predict(const double* feature_values) const = 0;
  
  /*!
-  * \brief Predtion for one record with leaf index
+  * \brief Prediction for one record with leaf index
  * \param feature_values Feature value on this record
  * \return Predicted leaf index for this record
  */

--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -540,7 +540,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
 *          C_API_PREDICT_RAW_SCORE: raw score
 *          C_API_PREDICT_LEAF_INDEX: leaf index
 * \param num_iteration number of iteration for prediction, <= 0 means no limit
-* \param out_len lenght of prediction
+* \param out_len length of prediction
 * \return 0 when succeed, -1 when failure happens
 */
 LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,

--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -119,7 +119,7 @@ public:
  * \brief Get prediction result at data_idx data
  * \param data_idx 0: training data, 1: 1st validation data
  * \param result used to store prediction result, should allocate memory before call this function
-  * \param out_len lenght of returned score
+  * \param out_len length of returned score
  */
  void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;


--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -721,11 +721,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
        io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
    }
  } else {
-    // if have multi-machines, need find bin distributed
+    // if have multi-machines, need to find bin distributed
    // different machines will find bin for different features

    // start and len will store the process feature indices for different machines
-    // machine i will find bins for features in [ strat[i], start[i] + len[i] )
+    // machine i will find bins for features in [ start[i], start[i] + len[i] )
    std::vector<int> start(num_machines);
    std::vector<int> len(num_machines);
    int total_num_feature = static_cast<int>(sample_values.size());
@@ -738,8 +738,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
      start[i + 1] = start[i] + len[i];
    }
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
+  #pragma omp parallel for schedule(guided)
+    for (int i = 0; i < len[rank]; ++i) {
+      if (ignore_features_.count(start[rank] + i) > 0) {
+        continue;
+      }
+      BinType bin_type = BinType::NumericalBin;
+      if (categorical_features_.count(start[rank] + i)) {
+        bin_type = BinType::CategoricalBin;
+      }
+      bin_mappers[i].reset(new BinMapper());
+      bin_mappers[i]->FindBin(sample_values[start[rank] + i], sample_data.size(),
+                         io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
+    }
+    // get max_bin
+    int max_bin = 0;
+    for (int i = 0; i < len[rank]; ++i) {
+      if (ignore_features_.count(start[rank] + i) > 0) {
+        continue;
+      }
+      max_bin = std::max(max_bin, bin_mappers[i]->num_bin());
+    }
    // get size of bin mapper with max_bin_ size
-    int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
+    int type_size = BinMapper::SizeForSpecificBin(max_bin);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    int buffer_size = type_size * total_num_feature;
    auto input_buffer = std::vector<char>(buffer_size);
@@ -751,14 +772,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
      if (ignore_features_.count(start[rank] + i) > 0) {
        continue;
      }
-      BinType bin_type = BinType::NumericalBin;
-      if (categorical_features_.count(start[rank] + i)) {
-        bin_type = BinType::CategoricalBin;
-      }
-      BinMapper bin_mapper;
-      bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(), 
-        io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
-      bin_mapper.CopyTo(input_buffer.data() + i * type_size);
+      bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
+      // free
+      bin_mappers[i].reset(nullptr);
    }
    // convert to binary size
    for (int i = 0; i < num_machines; ++i) {