Commit ebc0de8b authored by Guolin Ke

fix bugs for parallel learning.

parent 368adeb3
@@ -52,6 +52,87 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
const DatasetHandle reference,
DatasetHandle* out);
/*!
* \brief create an empty dataset by sampling CSR data; if num_sample_row == num_total_row, the full dataset will be constructed directly
* \param indptr pointer to row headers
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices pointer to column indices
* \param data pointer to feature values
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param n_sample_elem number of nonzero elements in the matrix
* \param num_col number of columns
* \param num_total_row number of total rows
* \param parameters additional parameters
* \param out created dataset
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t n_sample_elem,
int64_t num_col,
int64_t num_total_row,
const char* parameters,
DatasetHandle* out);
/*!
* \brief create an empty dataset aligned to a reference Dataset
* \param reference the reference Dataset, used to align the bin mappers
* \param num_total_row number of total rows
* \param out created dataset
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateByReference(const DatasetHandle reference,
int64_t num_total_row,
DatasetHandle* out);
/*!
* \brief push dense data into an existing dataset; if start_row + nrow == num_total_row, dataset->FinishLoad will be called automatically
* \param dataset handle of dataset
* \param data pointer to the data space
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number of columns
* \param start_row row start index
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset,
const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
int32_t start_row);
/*!
* \brief push CSR data into an existing dataset; if start_row + (nindptr - 1) == num_total_row, dataset->FinishLoad will be called automatically
* \param dataset handle of dataset
* \param indptr pointer to row headers
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices pointer to column indices
* \param data pointer to feature values
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns
* \param start_row row start index
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t nelem,
int64_t num_col,
int64_t start_row);
/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
...
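Taken together, the four new entry points form an incremental-loading path for parallel learning: one process fits the bin mappers from a row sample (LGBM_DatasetCreateFromSampledCSR), the other processes align to it (LGBM_DatasetCreateByReference), and every process then streams its own row range in (LGBM_DatasetPushRows / LGBM_DatasetPushRowsByCSR). Below is a minimal single-process sketch of that flow; the CSR buffers, row counts, and parameter string are illustrative assumptions, not part of the commit.

```cpp
#include <LightGBM/c_api.h>
#include <cstdint>

int main() {
  // Hypothetical 2-row, 3-column CSR sample used to fit the bin mappers.
  int32_t sample_indptr[]  = {0, 2, 3};
  int32_t sample_indices[] = {0, 2, 1};
  double  sample_data[]    = {1.0, 3.0, 2.0};

  DatasetHandle dataset = nullptr;
  // num_sample_row (= nindptr - 1 = 2) < num_total_row (= 4), so this call
  // only fits the bin mappers and allocates an empty 4-row dataset.
  if (LGBM_DatasetCreateFromSampledCSR(sample_indptr, C_API_DTYPE_INT32,
                                       sample_indices, sample_data,
                                       C_API_DTYPE_FLOAT64,
                                       /*nindptr=*/3, /*n_sample_elem=*/3,
                                       /*num_col=*/3, /*num_total_row=*/4,
                                       "max_bin=255", &dataset) != 0) {
    return 1;
  }
  // A worker process would instead align to rank 0's bin mappers with
  //   LGBM_DatasetCreateByReference(rank0_handle, 4, &dataset);

  // Push the sampled rows as rows [0, 2) ...
  LGBM_DatasetPushRowsByCSR(dataset, sample_indptr, C_API_DTYPE_INT32,
                            sample_indices, sample_data, C_API_DTYPE_FLOAT64,
                            3, 3, 3, /*start_row=*/0);
  // ... and two more rows as [2, 4); this push reaches num_total_row and
  // triggers dataset->FinishLoad() automatically.
  int32_t rest_indptr[]  = {0, 1, 3};
  int32_t rest_indices[] = {2, 0, 1};
  double  rest_data[]    = {4.0, 5.0, 6.0};
  LGBM_DatasetPushRowsByCSR(dataset, rest_indptr, C_API_DTYPE_INT32,
                            rest_indices, rest_data, C_API_DTYPE_FLOAT64,
                            3, 3, 3, /*start_row=*/2);
  LGBM_DatasetFree(dataset);
  return 0;
}
```

Because the final push triggers FinishLoad on its own, the C API needs no explicit "finish" call; the is_finish_load_ guard added to Dataset further below makes repeated or stray pushes after that point harmless.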
@@ -343,6 +343,8 @@ struct ParameterAlias {
{ "test_data", "valid_data" },
{ "test", "valid_data" },
{ "is_sparse", "is_enable_sparse" },
{ "enable_sparse", "is_enable_sparse" },
{ "pre_partition", "is_pre_partition" },
{ "tranining_metric", "is_training_metric" },
{ "train_metric", "is_training_metric" },
{ "ndcg_at", "ndcg_eval_at" },
...
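The two aliases added here follow the table's existing pattern: legacy or shorthand keys are rewritten to canonical parameter names before any config object reads them, so pre_partition=true and is_pre_partition=true behave identically. A minimal sketch of that normalization, assuming the key/value map produced by ConfigBase::Str2Map (this is not LightGBM's exact helper):

```cpp
#include <string>
#include <unordered_map>

// Rewrite every user-supplied key to its canonical name before the config
// objects read it. The alias table below is a hypothetical excerpt.
std::unordered_map<std::string, std::string> NormalizeKeys(
    const std::unordered_map<std::string, std::string>& params) {
  static const std::unordered_map<std::string, std::string> kAliases = {
    {"enable_sparse", "is_enable_sparse"},
    {"pre_partition", "is_pre_partition"},
    {"train_metric", "is_training_metric"},
  };
  std::unordered_map<std::string, std::string> out;
  for (const auto& kv : params) {
    auto it = kAliases.find(kv.first);
    out[it != kAliases.end() ? it->second : kv.first] = kv.second;
  }
  return out;
}
```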
@@ -312,6 +312,7 @@ public:
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
if (is_finish_load_) { return; }
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
if (feature_idx >= 0) {
@@ -323,6 +324,7 @@ public:
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
if (is_finish_load_) { return; }
for (auto& inner_data : feature_values) {
if (inner_data.first >= num_total_features_) { continue; }
int feature_idx = used_feature_map_[inner_data.first];
@@ -520,6 +522,7 @@ private:
std::vector<uint64_t> group_bin_boundaries_;
std::vector<int> group_feature_start_;
std::vector<int> group_feature_cnt_;
bool is_finish_load_;
};
} // namespace LightGBM
...
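The new is_finish_load_ flag makes loading idempotent: FinishLoad can be reached from several paths (the final push call, binary-file loading, CopySubset), and any PushOneRow arriving after completion must be a no-op. A condensed sketch of the guard pattern, with Dataset's internals abbreviated:

```cpp
// Sketch of the guard this commit threads through Dataset (names and
// storage abbreviated). The flag makes FinishLoad() safe to re-enter and
// turns late pushes into harmless no-ops.
class GuardedLoader {
 public:
  void PushOneRow(int row_idx, double value) {
    if (is_finish_load_) { return; }  // stray late push: ignore
    // ... write value into per-feature bin storage for row_idx ...
  }
  void FinishLoad() {
    if (is_finish_load_) { return; }  // already finalized: safe to call twice
    // ... finalize bin data for every feature group ...
    is_finish_load_ = true;
  }
 private:
  bool is_finish_load_ = false;
};
```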
@@ -310,6 +310,110 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t n_sample_elem,
int64_t num_col,
int64_t num_total_row,
const char* parameters,
DatasetHandle* out) {
if (nindptr - 1 == num_total_row) {
return LGBM_DatasetCreateFromCSR(indptr, indptr_type, indices, data,
data_type, nindptr, n_sample_elem, num_col, parameters, nullptr, out);
} else {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
io_config.Set(param);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, n_sample_elem);
int32_t num_sample_row = static_cast<int32_t>(nindptr - 1);
std::vector<std::vector<double>> sample_values(num_col);
std::vector<std::vector<int>> sample_idx(num_col);
for (int i = 0; i < num_sample_row; ++i) {
auto row = get_row_fun(i);
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
sample_values.resize(inner_data.first + 1);
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(i);
}
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
*out = loader.CostructFromSampleData(sample_values, sample_idx,
num_sample_row,
static_cast<data_size_t>(num_total_row));
API_END();
}
}
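The sampling loop above transposes the row-wise CSR sample into the per-column buffers that DatasetLoader needs to build bin mappers. A standalone illustration with toy values (the matrix and the epsilon are assumptions):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise CSR in, per-column sample buffers out, zeros skipped; this is
// the same transpose the loop above performs on the sampled rows.
int main() {
  const double kEps = 1e-15;
  std::vector<int>    indptr  = {0, 2, 3, 4};        // 3 rows
  std::vector<int>    indices = {0, 2, 1, 0};        // column ids
  std::vector<double> data    = {1.0, 3.0, 2.0, 4.0};
  std::vector<std::vector<double>> sample_values(3); // values per column
  std::vector<std::vector<int>>    sample_idx(3);    // sample row per value
  for (int row = 0; row + 1 < static_cast<int>(indptr.size()); ++row) {
    for (int k = indptr[row]; k < indptr[row + 1]; ++k) {
      if (std::fabs(data[k]) > kEps) {               // zeros are skipped
        sample_values[indices[k]].push_back(data[k]);
        sample_idx[indices[k]].push_back(row);
      }
    }
  }
  // sample_values == {{1,4},{2},{3}}, sample_idx == {{0,2},{1},{0}}
  printf("col0 has %zu sampled values\n", sample_values[0].size());
  return 0;
}
```

Skipping near-zero entries is what keeps sparse columns cheap to bin: only the nonzero sample values participate in bin-boundary search.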
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateByReference(const DatasetHandle reference,
int64_t num_total_row,
DatasetHandle* out) {
API_BEGIN();
std::unique_ptr<Dataset> ret;
ret.reset(new Dataset(static_cast<data_size_t>(num_total_row)));
ret->CreateValid(reinterpret_cast<const Dataset*>(reference));
*out = ret.release();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset,
const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
int32_t start_row) {
API_BEGIN();
auto p_dataset = reinterpret_cast<Dataset*>(dataset);
auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
p_dataset->PushOneRow(tid, start_row + i, one_row);
}
if (start_row + nrow == p_dataset->num_data()) {
p_dataset->FinishLoad();
}
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t nelem,
int64_t /* num_col, unused */,
int64_t start_row) {
API_BEGIN();
auto p_dataset = reinterpret_cast<Dataset*>(dataset);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
int32_t nrow = static_cast<int32_t>(nindptr - 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
p_dataset->PushOneRow(tid,
static_cast<data_size_t>(start_row + i), one_row);
}
if (start_row + nrow == static_cast<int64_t>(p_dataset->num_data())) {
p_dataset->FinishLoad();
}
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
@@ -394,7 +498,6 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
// edit the feature value
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(static_cast<int>(i));
}
...
@@ -303,7 +303,6 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetDouble(params, "histogram_pool_size", &histogram_pool_size);
GetInt(params, "max_depth", &max_depth);
GetInt(params, "top_k", &top_k);
CHECK(max_depth > 1 || max_depth < 0);
}
...
@@ -20,12 +20,14 @@ const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______
Dataset::Dataset() {
data_filename_ = "noname";
num_data_ = 0;
is_finish_load_ = false;
}
Dataset::Dataset(data_size_t num_data) {
data_filename_ = "noname";
num_data_ = num_data;
metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
is_finish_load_ = false;
}
Dataset::~Dataset() {
@@ -52,7 +54,7 @@ void Dataset::Construct(
for (int i = 0; i < static_cast<int>(bin_mappers.size()); ++i) {
if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trival()) {
used_features.emplace_back(i);
}
}
}
auto features_in_group = NoGroup(used_features);
@@ -110,10 +112,12 @@
}
void Dataset::FinishLoad() {
if (is_finish_load_) { return; }
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
feature_groups_[i]->bin_data_->FinishLoad();
}
is_finish_load_ = true;
}
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
@@ -221,6 +225,7 @@ void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices
if (need_meta_data) {
metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
}
is_finish_load_ = true;
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
...
@@ -445,7 +445,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
}
dataset->metadata_.PartitionLabel(*used_data_indices);
// read feature data
for (int i = 0; i < dataset->num_features_; ++i) {
for (int i = 0; i < dataset->num_groups_; ++i) {
// read feature size
read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
@@ -471,6 +471,7 @@
}
dataset->feature_groups_.shrink_to_fit();
fclose(file);
dataset->is_finish_load_ = true;
return dataset.release();
}
...
@@ -47,17 +47,19 @@ void DataParallelTreeLearner::BeforeTrain() {
// generate feature partition for current tree
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
auto num_bin = train_data_->FeatureNumBin(i);
if (train_data_->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_distribution[cur_min_machine].push_back(inner_feature_index);
auto num_bin = train_data_->FeatureNumBin(inner_feature_index);
if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_bins_distributed[cur_min_machine] += num_bin;
}
is_feature_aggregated_[i] = false;
is_feature_aggregated_[inner_feature_index] = false;
}
// get local used features
for (auto fid : feature_distribution[rank_]) {
@@ -167,7 +169,6 @@ void DataParallelTreeLearner::FindBestThresholds() {
smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_.data() + buffer_read_start_pos_[feature_index]);
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
@@ -179,9 +180,9 @@
smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only root leaf
@@ -199,6 +200,7 @@
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
...
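The feature-distribution loop in BeforeTrain above is a greedy balancer: each used feature is assigned to the machine that currently holds the fewest histogram bins (ArgMin over num_bins_distributed), which keeps the ReduceScatter payloads roughly even. A standalone sketch with assumed bin counts:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Greedy bin balancing: every feature goes to the machine that currently
// holds the fewest bins. The per-feature bin counts are assumptions.
int main() {
  const int num_machines = 3;
  std::vector<int> feature_num_bins = {255, 16, 255, 64, 8, 128};
  std::vector<std::vector<int>> feature_distribution(num_machines);
  std::vector<int> num_bins_distributed(num_machines, 0);
  for (int fid = 0; fid < static_cast<int>(feature_num_bins.size()); ++fid) {
    int target = static_cast<int>(
        std::min_element(num_bins_distributed.begin(),
                         num_bins_distributed.end()) -
        num_bins_distributed.begin());
    feature_distribution[target].push_back(fid);
    num_bins_distributed[target] += feature_num_bins[fid];
  }
  for (int m = 0; m < num_machines; ++m) {
    printf("machine %d: %d bins\n", m, num_bins_distributed[m]);
  }
  return 0;
}
```

The bug fix in this hunk is the index space: the loop now walks num_total_features() and maps through InnerFeatureIndex, so every machine enumerates the same columns in the same order even if some were dropped as trivial locally.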
@@ -13,7 +13,6 @@ namespace LightGBM
class FeatureMetainfo {
public:
int feature_idx;
int num_bin;
int bias = 0;
/*! \brief pointer of tree config */
@@ -126,7 +125,6 @@ public:
}
if (is_splittable_) {
// update split information
output->feature = meta_->feature_idx;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
@@ -139,7 +137,6 @@
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = meta_->feature_idx;
output->gain = kMinScore;
}
}
@@ -223,7 +220,6 @@ public:
}
if (is_splittable_) {
// update split information
output->feature = meta_->feature_idx;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
@@ -236,7 +232,6 @@
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = meta_->feature_idx;
output->gain = kMinScore;
}
}
@@ -353,7 +348,6 @@ public:
feature_metas_.resize(train_data->num_features());
#pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].feature_idx = i;
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
...
@@ -28,12 +28,14 @@ void FeatureParallelTreeLearner::BeforeTrain() {
// get feature partition
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(i);
is_feature_used_[i] = false;
feature_distribution[cur_min_machine].push_back(inner_feature_index);
num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(inner_feature_index);
is_feature_used_[inner_feature_index] = false;
}
}
// get local used features
...
@@ -2,7 +2,6 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h>
#include "split_info.hpp"
#include "data_partition.hpp"
#include <vector>
@@ -14,8 +13,8 @@ namespace LightGBM {
*/
class LeafSplits {
public:
LeafSplits(int num_feature, data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature),
LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) {
}
void ResetNumData(data_size_t num_data) {
@@ -127,8 +126,6 @@ private:
data_size_t num_data_in_leaf_;
/*! \brief number of all training data */
data_size_t num_data_;
/*! \brief number of features */
int num_features_;
/*! \brief sum of gradients of current leaf */
double sum_gradients_;
/*! \brief sum of hessians of current leaf */
...
@@ -17,7 +17,7 @@ std::chrono::duration<double, std::milli> ordered_bin_time;
#endif // TIMETAG
SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config)
:tree_config_(tree_config){
:tree_config_(tree_config) {
random_ = Random(tree_config_->feature_fraction_seed);
#pragma omp parallel
#pragma omp master
@@ -59,7 +59,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves);
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
@@ -71,8 +71,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
}
}
// initialize splits for leaf
smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
larger_leaf_splits_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
// initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
@@ -84,10 +84,10 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
order_bin_indices_.clear();
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
order_bin_indices_.push_back(i);
ordered_bin_indices_.push_back(i);
}
}
}
@@ -126,14 +126,13 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
order_bin_indices_.clear();
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
order_bin_indices_.push_back(i);
ordered_bin_indices_.push_back(i);
}
}
}
}
void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
@@ -188,14 +187,14 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
// only the root leaf can be split the first time
int right_leaf = -1;
for (int split = 0; split < tree_config_->num_leaves - 1; ++split) {
#ifdef TIMETAG
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
#endif
// some initial works before finding best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) {
#ifdef TIMETAG
#ifdef TIMETAG
init_split_time += std::chrono::steady_clock::now() - start_time;
#endif
#endif
// find best threshold for every feature
FindBestThresholds();
// find best split from all features
@@ -210,14 +209,14 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
Log::Info("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
break;
}
#ifdef TIMETAG
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
#endif
// split tree with best leaf
Split(tree.get(), best_leaf, &left_leaf, &right_leaf);
#ifdef TIMETAG
#ifdef TIMETAG
split_time += std::chrono::steady_clock::now() - start_time;
#endif
#endif
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
Log::Info("Trained a tree with leaves=%d and max_depth=%d", tree->num_leaves(), cur_depth);
@@ -228,19 +227,21 @@ void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
int used_feature_cnt = static_cast<int>(num_features_*tree_config_->feature_fraction);
if (used_feature_cnt < num_features_) {
if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction);
// initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// get the used features for the current tree
auto used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
#pragma omp parallel for schedule(static)
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_feature_indices.size()); ++i) {
is_feature_used_[used_feature_indices[i]] = 1;
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
if (inner_feature_index < 0) { continue; }
is_feature_used_[inner_feature_index] = 1;
}
} else {
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = 1;
}
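Note the fix in the feature_fraction branch above: sampling now happens in num_total_features() space, so machines drawing from the same seed agree on the subset even when they pruned different trivial columns locally, and each draw is mapped through InnerFeatureIndex before touching is_feature_used_. A sketch of that two-space sampling; the mapping array is an assumption:

```cpp
#include <cstdint>
#include <vector>

// Sample in "total feature" space, then mark the corresponding inner
// (used-feature) slots. -1 marks columns dropped as trivial at dataset
// construction; those draws are simply skipped.
std::vector<int8_t> SampleUsedFeatures(
    const std::vector<int>& inner_index_of,     // size = num_total_features
    const std::vector<int>& sampled_total_ids,  // draws from [0, total)
    int num_inner_features) {
  std::vector<int8_t> is_feature_used(num_inner_features, 0);
  for (int total_id : sampled_total_ids) {
    const int inner = inner_index_of[total_id];
    if (inner < 0) { continue; }  // trivial column: not stored internally
    is_feature_used[inner] = 1;
  }
  return is_feature_used;
}
```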
@@ -268,14 +269,14 @@ void SerialTreeLearner::BeforeTrain() {
// if has ordered bin, need to initialize the ordered bin
if (has_ordered_bin_) {
#ifdef TIMETAG
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
#endif
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
ordered_bins_[order_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
}
} else {
// bagging, only use part of data
......@@ -284,23 +285,23 @@ void SerialTreeLearner::BeforeTrain() {
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// initialize ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
ordered_bins_[order_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
}
#ifdef TIMETAG
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
#endif
}
}
@@ -320,7 +321,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
if (num_data_in_right_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)
&& num_data_in_left_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)) {
&& num_data_in_left_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
@@ -344,9 +345,9 @@
}
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
#ifdef TIMETAG
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
#endif
// mark data that is in the left leaf
const data_size_t* indices = data_partition_->indices();
const auto left_cnt = data_partition_->leaf_count(left_leaf);
@@ -359,22 +360,22 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// split the ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(order_bin_indices_.size()); ++i) {
ordered_bins_[order_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
#ifdef TIMETAG
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
#endif
}
return true;
}
@@ -387,7 +388,7 @@ void SerialTreeLearner::FindBestThresholds() {
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (parent_leaf_histogram_array_ != nullptr
if (parent_leaf_histogram_array_ != nullptr
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
@@ -401,21 +402,21 @@
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
}
#ifdef TIMETAG
hist_time += std::chrono::steady_clock::now() - start_time;
@@ -426,15 +427,15 @@
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
// find splits
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
const int tid = omp_get_thread_num();
SplitInfo smaller_split;
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(),
@@ -443,6 +444,7 @@
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -451,8 +453,8 @@
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
}
SplitInfo larger_split;
// find best threshold for larger child
@@ -463,6 +465,7 @@
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
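When use_subtract holds above, the larger child's histogram is obtained by subtracting the smaller child's histogram from the parent's, in place, instead of scanning the data a second time. A sketch of that bin-wise identity; the struct mirrors the per-bin gradient/hessian/count trio kept by HistogramBinEntry:

```cpp
#include <vector>

// Every data point of the parent leaf falls into exactly one child, so
// larger = parent - smaller holds bin by bin for all three accumulators.
struct BinEntry {
  double sum_gradients = 0.0;
  double sum_hessians = 0.0;
  int cnt = 0;
};

void SubtractHistogram(std::vector<BinEntry>* larger,   // holds parent on entry
                       const std::vector<BinEntry>& smaller) {
  for (size_t b = 0; b < larger->size(); ++b) {
    (*larger)[b].sum_gradients -= smaller[b].sum_gradients;
    (*larger)[b].sum_hessians  -= smaller[b].sum_hessians;
    (*larger)[b].cnt           -= smaller[b].cnt;
  }
}
```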
@@ -487,21 +490,23 @@ void SerialTreeLearner::FindBestSplitsForLeaves() {
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature,
train_data_->FeatureBinMapper(best_split_info.feature)->bin_type(),
best_split_info.threshold,
train_data_->RealFeatureIndex(best_split_info.feature),
train_data_->RealThreshold(best_split_info.feature, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain));
*right_leaf = tree->Split(best_Leaf,
inner_feature_index,
train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
best_split_info.threshold,
best_split_info.feature,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain));
// split data partition
data_partition_->Split(best_Leaf, train_data_, best_split_info.feature,
data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
best_split_info.threshold, *right_leaf);
// init the leaves used in the next iteration
@@ -510,13 +515,12 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
} else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
}
}
} // namespace LightGBM
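A recurring thread in this commit: SplitInfo.feature now always carries the real (original column) index, while histograms, bin mappers, and the data partition are addressed by the inner (used-feature) index, with InnerFeatureIndex and RealFeatureIndex translating between the two, as Split() above does. A small sketch of the mapping; the example arrays are assumptions:

```cpp
#include <vector>

// "Real" indices address the original columns; "inner" indices address the
// features that survived bin construction. Here four columns exist and
// column 1 was dropped as trivial.
struct FeatureIndexMap {
  std::vector<int> inner_of_real{0, -1, 1, 2};  // -1: column was dropped
  std::vector<int> real_of_inner{0, 2, 3};

  int InnerFeatureIndex(int real_fidx) const { return inner_of_real[real_fidx]; }
  int RealFeatureIndex(int inner_fidx) const { return real_of_inner[inner_fidx]; }
};

// SplitInfo.feature stores the real index, which stays stable across
// machines and in the output model; local arrays are indexed by the inner
// index, so Split() translates before touching bin mappers or partitions.
```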
@@ -145,7 +145,7 @@ protected:
/*! \brief config of tree learner*/
const TreeConfig* tree_config_;
int num_threads_;
std::vector<int> order_bin_indices_;
std::vector<int> ordered_bin_indices_;
};
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
...
@@ -46,8 +46,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
larger_buffer_read_start_pos_.resize(num_features_);
global_data_count_in_leaf_.resize(tree_config_->num_leaves);
smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
local_tree_config_ = *tree_config_;
local_tree_config_.min_data_in_leaf /= num_machines_;
@@ -58,16 +58,12 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
// initialize histograms for global
smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
int num_total_bin = 0;
for (int i = 0; i < num_features_; ++i) {
num_total_bin += train_data_->FeatureNumBin(i);
}
auto num_total_bin = train_data_->NumTotalBin();
smaller_leaf_histogram_data_.resize(num_total_bin);
larger_leaf_histogram_data_.resize(num_total_bin);
feature_metas_.resize(train_data->num_features());
#pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].feature_idx = i;
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
@@ -207,17 +203,17 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
while (cur_used_features < cur_total_feature) {
// copy smaller leaf histograms first
if (smaller_idx < smaller_top_features.size()) {
int fid = smaller_top_features[smaller_idx];
int inner_feature_index = train_data_->InnerFeatureIndex(smaller_top_features[smaller_idx]);
++cur_used_features;
// mark local aggregated feature
if (i == rank_) {
smaller_is_feature_aggregated_[fid] = true;
smaller_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
smaller_is_feature_aggregated_[inner_feature_index] = true;
smaller_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
}
// copy
std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[fid].RawData(), smaller_leaf_histogram_array_[fid].SizeOfHistgram());
cur_size += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
reduce_scatter_size_ += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[inner_feature_index].RawData(), smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
cur_size += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
reduce_scatter_size_ += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
++smaller_idx;
}
if (cur_used_features >= cur_total_feature) {
@@ -225,17 +221,17 @@
}
// then copy larger leaf histograms
if (larger_idx < larger_top_features.size()) {
int fid = larger_top_features[larger_idx];
int inner_feature_index = train_data_->InnerFeatureIndex(larger_top_features[larger_idx]);
++cur_used_features;
// mark local aggregated feature
if (i == rank_) {
larger_is_feature_aggregated_[fid] = true;
larger_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
larger_is_feature_aggregated_[inner_feature_index] = true;
larger_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
}
// copy
std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[fid].RawData(), larger_leaf_histogram_array_[fid].SizeOfHistgram());
cur_size += larger_leaf_histogram_array_[fid].SizeOfHistgram();
reduce_scatter_size_ += larger_leaf_histogram_array_[fid].SizeOfHistgram();
std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[inner_feature_index].RawData(), larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
cur_size += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
reduce_scatter_size_ += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
++larger_idx;
}
}
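CopyLocalHistogram packs the voted features' histograms back-to-back into the flat ReduceScatter input buffer, recording each feature's read offset so its reduced histogram can be located afterwards; the fix above is that both the marking arrays and the histogram lookups now use the inner feature index. A generic sketch of that packing (blob ids and sizes are assumptions):

```cpp
#include <cstring>
#include <vector>

// Pack variable-size blobs into one flat buffer, remembering where each
// blob starts so it can be read back after the collective communication.
struct PackedBuffer {
  std::vector<char> bytes;
  std::vector<int> read_start_pos;  // offset of each blob, by feature id
};

void Append(PackedBuffer* buf, int feature_id, const void* blob, size_t size) {
  if (buf->read_start_pos.size() <= static_cast<size_t>(feature_id)) {
    buf->read_start_pos.resize(feature_id + 1, -1);
  }
  buf->read_start_pos[feature_id] = static_cast<int>(buf->bytes.size());
  buf->bytes.resize(buf->bytes.size() + size);
  std::memcpy(buf->bytes.data() + buf->read_start_pos[feature_id], blob, size);
}
```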
@@ -291,6 +287,7 @@ void VotingParallelTreeLearner::FindBestThresholds() {
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
const int real_feature_index = train_data_->RealFeatureIndex(feature_index);
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
@@ -301,6 +298,7 @@
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
&smaller_bestsplit_per_features[feature_index]);
smaller_bestsplit_per_features[feature_index].feature = real_feature_index;
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -317,6 +315,7 @@
larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
&larger_bestsplit_per_features[feature_index]);
larger_bestsplit_per_features[feature_index].feature = real_feature_index;
}
std::vector<SplitInfo> smaller_top_k_splits, larger_top_k_splits;
@@ -382,6 +381,7 @@
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
@@ -403,6 +403,7 @@
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
}
...