Commit 6c4a9750 authored by Guolin Ke's avatar Guolin Ke
Browse files

clean code for the split of bins and leaves.

parent 8fb26b06
......@@ -12,462 +12,478 @@
namespace LightGBM {
enum BinType {
NumericalBin,
CategoricalBin
};
enum MissingType {
None,
Zero,
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry);
int used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
enum BinType {
NumericalBin,
CategoricalBin
};
enum MissingType {
None,
Zero,
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry);
int used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
}
return true;
}
return true;
}
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Mapping bin into feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(uint32_t bin) const {
  // Numerical features map a bin back to its upper bound; categorical
  // features map a bin back to the original integer category value.
  return bin_type_ == BinType::NumericalBin
             ? bin_upper_bound_[bin]
             : static_cast<double>(bin_2_categorical_[bin]);
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Mapping feature value into bin
* \param value
* \return bin for this feature value
*/
inline uint32_t ValueToBin(double value) const;
/*!
* \brief Get the default bin when value is 0
* \return default bin
*/
inline uint32_t GetDefaultBin() const {
return default_bin_;
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Mapping bin into feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(uint32_t bin) const {
if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin];
} else {
return bin_2_categorical_[bin];
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature, Note: not include zero.
* \param num_values number of values.
* \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
/*!
* \brief Use specific number of bin to calculate the size of this class
* \param bin The number of bin
* \return Size
*/
static int SizeForSpecificBin(int bin);
/*!
* \brief Seirilizing this object to buffer
* \param buffer The destination
*/
void CopyTo(char* buffer) const;
/*!
* \brief Deserilizing this object from buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin types
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get a human-readable description of this feature's bins:
*        the ':'-joined category list for a categorical feature,
*        or the "[min:max]" raw-value range for a numerical feature.
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ":");
} else {
std::stringstream str_buf;
// digits10 + 2 == max_digits10 for IEEE double: enough significant
// digits for the printed value to round-trip without precision loss.
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << '[' << min_val_ << ':' << max_val_ << ']';
return str_buf.str();
}
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Mapping feature value into bin
* \param value
* \return bin for this feature value
*/
inline uint32_t ValueToBin(double value) const;
/*!
* \brief Get the default bin when value is 0
* \return default bin
*/
inline uint32_t GetDefaultBin() const {
return default_bin_;
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature, Note: not include zero.
* \param num_values number of values.
* \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
/*!
* \brief Use specific number of bin to calculate the size of this class
* \param bin The number of bin
* \return Size
*/
static int SizeForSpecificBin(int bin);
/*!
* \brief Seirilizing this object to buffer
* \param buffer The destination
*/
void CopyTo(char* buffer) const;
/*!
* \brief Deserilizing this object from buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin types
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get a human-readable description of this feature's bins:
*        the ':'-joined category list for a categorical feature,
*        or the "[min:max]" raw-value range for a numerical feature.
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ":");
} else {
std::stringstream str_buf;
// digits10 + 2 == max_digits10 for IEEE double: enough significant
// digits for the printed value to round-trip without precision loss.
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << '[' << min_val_ << ':' << max_val_ << ']';
return str_buf.str();
}
}
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature vaule */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature vaule */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* There are 2 advantages by using ordered bin.
* 1. group the data by leafs to improve the cache hit.
* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* So we only using ordered bin for sparse situations.
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* There are 2 advantages by using ordered bin.
* 1. group the data by leafs to improve the cache hit.
* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* So we only using ordered bin for sparse situations.
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was build for bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was built for the bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: non-ordered by leaf
* \param hessians Hessians, Note: non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin (hessian-free overload)
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf to split
* \param right_leaf The new leaf index after performing this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
/*! \brief Number of data rows with a non-zero bin in the given leaf
*         (inferred from the name -- confirm against implementations) */
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
/*! \brief Get the stored bin value without the Get() adjustments
*         (presumably no min_bin/default_bin mapping -- confirm) */
virtual uint32_t RawGet(data_size_t idx) = 0;
/*! \brief Restart iteration at row idx (assumed -- confirm against implementations) */
virtual void Reset(data_size_t idx) = 0;
virtual ~BinIterator() = default;
};
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Note that it may cause cache misses when construct histogram,
* but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param hessians Hessians, Note:non-oredered by leaf
* \param out Output Result
*/
/*! \brief Interface for bin data of one feature, stored in the original data
*          order (unlike OrderedBin, no re-ordering is needed after a split). */
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \param tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
/*! \brief Copy the rows listed in used_indices from full_bin into this bin */
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
* \param file File want to write
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory Source buffer (presumably written by SaveBinaryToFile -- confirm)
* \param local_used_indices Data indices to keep when loading (confirm semantics of empty vector against implementations)
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
/*! \brief Resize the storage to hold num_data records */
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*! \brief Same as above, but over all rows [0, num_data) instead of a data_indices subset */
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature (hessian-free overload),
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*! \brief Same as above, but over all rows [0, num_data) instead of a data_indices subset */
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices of the current leaf.
* \param num_data Number of used data
* \param lte_indices After calling this function, the less-than-or-equal data indices are stored in this array.
* \param gt_indices After calling this function, the greater data indices are stored in this array.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief Call after all feature data has been pushed, so the bin data can be
*        finalized / repacked into a better internal layout.
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
* \param right_leaf The new leaf index after perform this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
/*! \brief Get the stored bin value without the Get() adjustments
*         (presumably no min_bin/default_bin mapping -- confirm) */
virtual uint32_t RawGet(data_size_t idx) = 0;
/*! \brief Restart iteration at row idx (assumed -- confirm against implementations) */
virtual void Reset(data_size_t idx) = 0;
virtual ~BinIterator() = default;
};
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Note that it may cause cache misses when construct histogram,
* but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \param tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
/*! \brief Copy the rows listed in used_indices from full_bin into this bin */
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
* \param file File want to write
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory Source buffer (presumably written by SaveBinaryToFile -- confirm)
* \param local_used_indices Data indices to keep when loading (confirm semantics of empty vector against implementations)
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
/*! \brief Resize the storage to hold num_data records */
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*! \brief Same as above, but over all rows [0, num_data) instead of a data_indices subset */
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature (hessian-free overload),
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*! \brief Same as above, but over all rows [0, num_data) instead of a data_indices subset */
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices of the current leaf.
* \param num_data Number of used data
* \param lte_indices After calling this function, the less-than-or-equal data indices are stored in this array.
* \param gt_indices After calling this function, the greater data indices are stored in this array.
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Split data for a categorical feature: bins matching the threshold category
*        go left (lte_indices), all others right (gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param threshold The split threshold.
* \param data_indices Used data indices of the current leaf.
* \param num_data Number of used data
* \param lte_indices After calling this function, the left-child data indices are stored in this array.
* \param gt_indices After calling this function, the right-child data indices are stored in this array.
* \return The number of data rows sent to the left child.
*/
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief Call after all feature data has been pushed, so the bin data can be
*        finalized / repacked into a better internal layout.
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else {
value = 0.0f;
}
}
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
value = 0.0f;
l = m + 1;
}
}
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
l = m + 1;
}
}
return l;
return l;
} else {
int int_value = static_cast<int>(value);
// convert negative value to NaN bin
if (int_value < 0) {
return num_bin_ - 1;
}
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
int int_value = static_cast<int>(value);
// convert negative value to NaN bin
if (int_value < 0) {
return num_bin_ - 1;
}
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
return num_bin_ - 1;
}
return num_bin_ - 1;
}
}
}
} // namespace LightGBM
......
......@@ -168,9 +168,14 @@ public:
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, default_bin, threshold, data_indices, num_data, lte_indices, gt_indices);
}
}
/*!
* \brief From bin to feature value
......
......@@ -37,9 +37,8 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_bin Threshold(bin) of split
* \param threshold_double Threshold on feature value
* \param left_value Model Left child output
* \param right_value Model Right child output
......@@ -50,10 +49,29 @@ public:
* \param default_left default direction for missing value
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double right_value,
int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);
/*!
* \brief Performing a split on tree leaves, with categorical feature
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param real_feature Index of feature, the original index on data
* \param threshold_bin Threshold(bin) of split, use bitset to represent
* \param num_threshold_bin size of threshold_bin
* \param threshold
* \param left_value Model Left child output
* \param right_value Model Right child output
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param gain Split gain
* \return The index of new leaf.
*/
int SplitCategorical(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type);
/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
......@@ -89,6 +107,7 @@ public:
* \return Prediction result
*/
inline double Predict(const double* feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const;
inline void PredictContrib(const double* feature_values, int num_features, double* output) const;
......@@ -139,7 +158,7 @@ public:
* \param rate The factor of shrinkage
*/
inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static, 512) if (num_leaves_ >= 1024)
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] *= rate;
if (leaf_value_[i] > kMaxTreeOutput) { leaf_value_[i] = kMaxTreeOutput; }
......@@ -157,24 +176,6 @@ public:
/*! \brief Serialize this object to if-else statement*/
std::string ToIfElse(int index, bool is_predict_leaf_index);
template<typename T>
inline static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
return false;
}
}
template<typename T>
inline static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
return true;
} else {
return false;
}
}
inline static bool IsZero(double fval) {
if (fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange) {
return true;
......@@ -204,21 +205,44 @@ public:
(*decision_type) |= (input << 2);
}
inline static uint32_t ConvertMissingValue(uint32_t fval, uint32_t threshold, int8_t decision_type, uint32_t default_bin, uint32_t max_bin) {
uint8_t missing_type = GetMissingType(decision_type);
if ((missing_type == 1 && fval == default_bin)
|| (missing_type == 2 && fval == max_bin)) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
fval = threshold;
private:
inline std::string NumericalDecisionIfElse(int node) {
std::stringstream str_buf;
uint8_t missing_type = GetMissingType(decision_type_[node]);
bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
if (missing_type == 0 || (missing_type == 1 && default_left && kZeroAsMissingValueRange < threshold_[node])) {
str_buf << "if (fval <= " << threshold_[node] << ") {";
} else if (missing_type == 1) {
if (default_left) {
str_buf << "if (fval <= " << threshold_[node] << " || Tree::IsZero(fval)" << " || std::isnan(fval)) {";
} else {
str_buf << "if (fval <= " << threshold_[node] << " && !Tree::IsZero(fval)" << " && !std::isnan(fval)) {";
}
} else {
if (default_left) {
str_buf << "if (fval <= " << threshold_[node] << " || std::isnan(fval)) {";
} else {
fval = threshold + 1;
str_buf << "if (fval <= " << threshold_[node] << " && !std::isnan(fval)) {";
}
}
return fval;
return str_buf.str();
}
inline static double ConvertMissingValue(double fval, double threshold, int8_t decision_type) {
uint8_t missing_type = GetMissingType(decision_type);
inline std::string CategoricalDecisionIfElse(int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
std::stringstream str_buf;
if (missing_type == 2) {
str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
} else {
str_buf << "if (std::isnan(fval)) { int_fval = 0; } else { int_fval = static_cast<int>(fval); }";
}
str_buf << "if (int_fval >= 0 && int_fval == " << static_cast<int>(threshold_[node]) << ") {";
return str_buf.str();
}
inline int NumericalDecision(double fval, int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
if (std::isnan(fval)) {
if (missing_type != 2) {
fval = 0.0f;
......@@ -226,28 +250,79 @@ public:
}
if ((missing_type == 1 && IsZero(fval))
|| (missing_type == 2 && std::isnan(fval))) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
fval = threshold;
if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
return left_child_[node];
} else {
fval = 10.0f * threshold;
return right_child_[node];
}
}
return fval;
if (fval <= threshold_[node]) {
return left_child_[node];
} else {
return right_child_[node];
}
}
inline static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
if ((missing_type == 1 && fval == default_bin)
|| (missing_type == 2 && fval == max_bin)) {
if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
return left_child_[node];
} else {
return right_child_[node];
}
}
if (fval <= threshold_in_bin_[node]) {
return left_child_[node];
} else {
return "is";
return right_child_[node];
}
}
static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
static std::vector<bool(*)(double, double)> decision_funs;
inline int CategoricalDecision(double fval, int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
int int_fval = static_cast<int>(fval);
if (int_fval < 0) {
return right_child_[node];;
} else if (std::isnan(fval)) {
// NaN is always in the right
if (missing_type == 2) {
return right_child_[node];
}
int_fval = 0;
}
if (int_fval == static_cast<int>(threshold_[node])) {
return left_child_[node];
}
return right_child_[node];
}
private:
inline int CategoricalDecisionInner(uint32_t fval, int node) const {
if (fval == threshold_in_bin_[node]) {
return left_child_[node];
}
return right_child_[node];
}
inline int Decision(double fval, int node) const {
if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
return CategoricalDecision(fval, node);
} else {
return NumericalDecision(fval, node);
}
}
inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
return CategoricalDecisionInner(fval, node);
} else {
return NumericalDecisionInner(fval, node, default_bin, max_bin);
}
}
inline void Split(int leaf, int feature, int real_feature,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
/*!
* \brief Find leaf index of which record belongs by features
* \param feature_values Feature value of this record
......@@ -288,6 +363,7 @@ private:
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
int num_cat_;
/*! \brief Store the information for categorical feature handle and mising value handle. */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */
......@@ -306,9 +382,44 @@ private:
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
};
inline void Tree::Split(int leaf, int feature, int real_feature,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1;
// update parent info
int parent = leaf_parent_[leaf];
if (parent >= 0) {
// if cur node is left child
if (left_child_[parent] == ~leaf) {
left_child_[parent] = new_node_idx;
} else {
right_child_[parent] = new_node_idx;
}
}
// add new node
split_feature_inner_[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
split_gain_[new_node_idx] = Common::AvoidInf(gain);
// add two new leaves
left_child_[new_node_idx] = ~leaf;
right_child_[new_node_idx] = ~num_leaves_;
// update new leaves
leaf_parent_[leaf] = new_node_idx;
leaf_parent_[num_leaves_] = new_node_idx;
// save current leaf value to internal node before change
internal_value_[new_node_idx] = leaf_value_[leaf];
internal_count_[new_node_idx] = left_cnt + right_cnt;
leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
leaf_count_[leaf] = left_cnt;
leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
leaf_count_[num_leaves_] = right_cnt;
// update leaf depth
leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
leaf_depth_[leaf]++;
}
inline double Tree::Predict(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
......@@ -409,8 +520,7 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node
} else {
const int hot_index =
decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](feature_values[split_index], threshold_[node]);
const int hot_index = Decision(feature_values[split_index], node);
const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
const double w = data_count(node);
const double hot_zero_fraction = data_count(hot_index)/w;
......@@ -469,27 +579,13 @@ inline int Tree::MaxDepth() const {
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
if (has_categorical_) {
if (num_cat_ > 0) {
while (node >= 0) {
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
node = Decision(feature_values[split_feature_[node]], node);
}
} else {
while (node >= 0) {
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (NumericalDecision<double>(
fval,
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
node = NumericalDecision(feature_values[split_feature_[node]], node);
}
}
return ~node;
......
......@@ -473,7 +473,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
auto label = train_data_->metadata().label();
double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
new_tree->Split(0, 0, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
......@@ -553,7 +553,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
// only add default score one-time
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
new_tree->Split(0, 0, 0, 0, 0,
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
......
......@@ -127,7 +127,7 @@ public:
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
double output = class_default_output_[cur_tree_id];
objective_function_->ConvertOutput(&output, &output);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
new_tree->Split(0, 0, 0, 0, 0,
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
......
......@@ -190,11 +190,11 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
const VAL_T minb = static_cast<VAL_T>(min_bin);
const VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -204,59 +204,41 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
const VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
......@@ -266,6 +248,33 @@ public:
return lte_count;
}
virtual data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (threshold == default_bin) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint32_t bin = data_[idx];
if (bin < min_bin || bin > max_bin) {
default_indices[(*default_count)++] = idx;
} else if (bin - min_bin == threshold) {
lte_indices[lte_count++] = idx;
} else {
gt_indices[gt_count++] = idx;
}
}
return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
......
......@@ -229,11 +229,11 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin);
uint8_t maxb = static_cast<uint8_t>(max_bin);
const uint8_t minb = static_cast<uint8_t>(min_bin);
const uint8_t maxb = static_cast<uint8_t>(max_bin);
uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -243,59 +243,41 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
......@@ -304,6 +286,34 @@ public:
}
return lte_count;
}
virtual data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < min_bin || bin > max_bin) {
default_indices[(*default_count)++] = idx;
} else if (bin - min_bin == threshold) {
lte_indices[lte_count++] = idx;
} else {
gt_indices[gt_count++] = idx;
}
}
return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
......
......@@ -144,12 +144,12 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
// not need to split
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
const VAL_T minb = static_cast<VAL_T>(min_bin);
const VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -160,64 +160,74 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
const VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
}
virtual data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
data_size_t lte_count = 0;
data_size_t gt_count = 0;
SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
uint32_t bin = iterator.InnerRawGet(idx);
if (bin < min_bin || bin > max_bin) {
default_indices[(*default_count)++] = idx;
} else if (bin - min_bin == threshold) {
lte_indices[lte_count++] = idx;
} else {
gt_indices[gt_count++] = idx;
}
}
return lte_count;
}
......
......@@ -15,11 +15,6 @@
namespace LightGBM {
std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
{ Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) {
......@@ -43,38 +38,20 @@ Tree::Tree(int max_leaves)
num_leaves_ = 1;
leaf_parent_[0] = -1;
shrinkage_ = 1.0f;
has_categorical_ = false;
num_cat_ = 0;
}
Tree::~Tree() {
}
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature, double threshold_double,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain,
MissingType missing_type, bool default_left) {
int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left) {
Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, gain);
int new_node_idx = num_leaves_ - 1;
// update parent info
int parent = leaf_parent_[leaf];
if (parent >= 0) {
// if cur node is left child
if (left_child_[parent] == ~leaf) {
left_child_[parent] = new_node_idx;
} else {
right_child_[parent] = new_node_idx;
}
}
// add new node
split_feature_inner_[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
decision_type_[new_node_idx] = 0;
if (bin_type == BinType::NumericalBin) {
SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
} else {
has_categorical_ = true;
SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
}
SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
if (missing_type == MissingType::None) {
SetMissingType(&decision_type_[new_node_idx], 0);
......@@ -83,31 +60,47 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
} else if (missing_type == MissingType::NaN) {
SetMissingType(&decision_type_[new_node_idx], 2);
}
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = Common::AvoidInf(threshold_double);
split_gain_[new_node_idx] = Common::AvoidInf(gain);
// add two new leaves
left_child_[new_node_idx] = ~leaf;
right_child_[new_node_idx] = ~num_leaves_;
// update new leaves
leaf_parent_[leaf] = new_node_idx;
leaf_parent_[num_leaves_] = new_node_idx;
// save current leaf value to internal node before change
internal_value_[new_node_idx] = leaf_value_[leaf];
internal_count_[new_node_idx] = left_cnt + right_cnt;
leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
leaf_count_[leaf] = left_cnt;
leaf_value_[num_leaves_] = std::isnan(right_value) ? 0.0f : right_value;
leaf_count_[num_leaves_] = right_cnt;
// update leaf depth
leaf_depth_[num_leaves_] = leaf_depth_[leaf] + 1;
leaf_depth_[leaf]++;
++num_leaves_;
return num_leaves_ - 1;
}
int Tree::SplitCategorical(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type) {
Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, gain);
int new_node_idx = num_leaves_ - 1;
decision_type_[new_node_idx] = 0;
SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
if (missing_type == MissingType::None) {
SetMissingType(&decision_type_[new_node_idx], 0);
} else if (missing_type == MissingType::Zero) {
SetMissingType(&decision_type_[new_node_idx], 1);
} else if (missing_type == MissingType::NaN) {
SetMissingType(&decision_type_[new_node_idx], 2);
}
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold;
++num_cat_;
++num_leaves_;
return num_leaves_ - 1;
}
#define PredictionFun(niter, fidx_in_iter, start_pos, decision_fun, iter_idx, data_idx) \
std::vector<std::unique_ptr<BinIterator>> iter((niter)); \
for (int i = 0; i < (niter); ++i) { \
iter[i].reset(data->FeatureIterator((fidx_in_iter))); \
iter[i]->Reset((start_pos)); \
}\
for (data_size_t i = start; i < end; ++i) {\
int node = 0;\
while (node >= 0) {\
node = decision_fun(iter[(iter_idx)]->Get((data_idx)), node, default_bins[node], max_bins[node]);\
}\
score[(data_idx)] += static_cast<double>(leaf_value_[~node]);\
}\
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (num_leaves_ <= 1) { return; }
std::vector<uint32_t> default_bins(num_leaves_ - 1);
......@@ -118,98 +111,28 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
default_bins[i] = bin_mapper->GetDefaultBin();
max_bins[i] = bin_mapper->num_bin() - 1;
}
if (has_categorical_) {
if (num_cat_ > 0) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[node]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, DecisionInner, node, i);
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(data->num_features(), i, start, DecisionInner, split_feature_inner_[node], i);
});
}
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[node]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(num_leaves_ - 1, split_feature_inner_[i], start, NumericalDecisionInner, node, i);
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(data->num_features(), i, start, NumericalDecisionInner, split_feature_inner_[node], i);
});
}
}
......@@ -227,110 +150,39 @@ void Tree::AddPredictionToScore(const Dataset* data,
default_bins[i] = bin_mapper->GetDefaultBin();
max_bins[i] = bin_mapper->num_bin() - 1;
}
if (has_categorical_) {
if (num_cat_ > 0) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[node]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, used_data_indices, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], DecisionInner, node, used_data_indices[i]);
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, used_data_indices, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(data->num_features(), i, used_data_indices[start], DecisionInner, split_feature_inner_[node], used_data_indices[i]);
});
}
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[node]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, used_data_indices, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(num_leaves_ - 1, split_feature_inner_[i], used_data_indices[start], NumericalDecisionInner, node, used_data_indices[i]);
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
Threading::For<data_size_t>(0, num_data, [this, &data, score, used_data_indices, &default_bins, &max_bins]
(int, data_size_t start, data_size_t end) {
PredictionFun(data->num_features(), i, used_data_indices[start], NumericalDecisionInner, split_feature_inner_[node], used_data_indices[i]);
});
}
}
}
#undef PredictionFun
std::string Tree::ToString() {
std::stringstream str_buf;
str_buf << "num_leaves=" << num_leaves_ << std::endl;
str_buf << "num_cat=" << num_cat_ << std::endl;
str_buf << "split_feature="
<< Common::ArrayToString<int>(split_feature_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "split_gain="
......@@ -354,7 +206,6 @@ std::string Tree::ToString() {
str_buf << "internal_count="
<< Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "shrinkage=" << shrinkage_ << std::endl;
str_buf << "has_categorical=" << (has_categorical_ ? 1 : 0) << std::endl;
str_buf << std::endl;
return str_buf.str();
}
......@@ -363,8 +214,8 @@ std::string Tree::ToJSON() {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
str_buf << "\"num_cat\":" << num_cat_ << "," << std::endl;
str_buf << "\"shrinkage\":" << shrinkage_ << "," << std::endl;
str_buf << "\"has_categorical\":" << (has_categorical_ ? 1 : 0) << "," << std::endl;
if (num_leaves_ == 1) {
str_buf << "\"tree_structure\":" << NodeToJSON(-1) << std::endl;
} else {
......@@ -383,8 +234,26 @@ std::string Tree::NodeToJSON(int index) {
str_buf << "\"split_index\":" << index << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
if (GetDecisionType(decision_type_[index], kCategoricalMask)) {
str_buf << "\"threshold\":" << static_cast<int>(threshold_[index]) << "," << std::endl;
str_buf << "\"decision_type\":\"==\"," << std::endl;
} else {
str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << std::endl;
str_buf << "\"decision_type\":\"<=\"," << std::endl;
}
if (GetDecisionType(decision_type_[index], kDefaultLeftMask)) {
str_buf << "\"default_left\":true," << std::endl;
} else {
str_buf << "\"default_left\":false," << std::endl;
}
uint8_t missing_type = GetMissingType(decision_type_[index]);
if (missing_type == 0) {
str_buf << "\"missing_type\":\"None\"," << std::endl;
} else if (missing_type == 1) {
str_buf << "\"missing_type\":\"Zero\"," << std::endl;
} else {
str_buf << "\"missing_type\":\"NaN\"," << std::endl;
}
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
......@@ -414,6 +283,11 @@ std::string Tree::ToIfElse(int index, bool is_predict_leaf_index) {
if (num_leaves_ == 1) {
str_buf << "return 0";
} else {
// use this for the missing value conversion
str_buf << "double fval = 0.0f; ";
if (num_cat_ > 0) {
str_buf << "int int_fval = 0; ";
}
str_buf << NodeToIfElse(0, is_predict_leaf_index);
}
str_buf << " }" << std::endl;
......@@ -425,13 +299,12 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) {
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
if (index >= 0) {
// non-leaf
str_buf << "if (Tree::ConvertMissingValue(arr[" << split_feature_[index] << "], " << threshold_[index] << ", " << static_cast<int>(decision_type_[index]) << ") ";
str_buf << "fval = arr[" << split_feature_[index] << "];";
if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
str_buf << "<";
str_buf << NumericalDecisionIfElse(index);
} else {
str_buf << "=";
str_buf << CategoricalDecisionIfElse(index);
}
str_buf << "= " << threshold_[index] << " ) { ";
// left subtree
str_buf << NodeToIfElse(left_child_[index], is_predict_leaf_index);
str_buf << " } else { ";
......@@ -471,6 +344,12 @@ Tree::Tree(const std::string& str) {
Common::Atoi(key_vals["num_leaves"].c_str(), &num_leaves_);
if (key_vals.count("num_cat") <= 0) {
Log::Fatal("Tree model should contain num_cat field.");
}
Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_);
if (num_leaves_ <= 1) { return; }
if (key_vals.count("left_child")) {
......@@ -544,15 +423,6 @@ Tree::Tree(const std::string& str) {
} else {
shrinkage_ = 1.0f;
}
if (key_vals.count("has_categorical")) {
int t = 0;
Common::Atoi(key_vals["has_categorical"].c_str(), &t);
has_categorical_ = t > 0;
} else {
has_categorical_ = false;
}
}
} // namespace LightGBM
......@@ -84,9 +84,9 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
double DCGCalculator::CalDCGAtK(data_size_t k, const float* label,
const double* score, data_size_t num_data) {
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
std::vector<data_size_t> sorted_idx(num_data);
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
sorted_idx[i] = i;
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
......@@ -104,9 +104,9 @@ double DCGCalculator::CalDCGAtK(data_size_t k, const float* label,
void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* label,
const double * score, data_size_t num_data, std::vector<double>* out) {
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
std::vector<data_size_t> sorted_idx(num_data);
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
sorted_idx[i] = i;
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
......
......@@ -516,27 +516,40 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
}
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf,
inner_feature_index,
train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
best_split_info.threshold,
best_split_info.feature,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
*left_leaf = best_leaf;
if (train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin) {
// split tree, will return right leaf
*right_leaf = tree->Split(best_leaf,
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
} else {
*right_leaf = tree->SplitCategorical(best_leaf,
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
}
// split data partition
data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
data_partition_->Split(best_leaf, train_data_, inner_feature_index,
best_split_info.threshold, best_split_info.default_left, *right_leaf);
// init the leaves that used on next iteration
......
......@@ -218,6 +218,7 @@
<ClInclude Include="..\src\boosting\gbdt.h" />
<ClInclude Include="..\src\boosting\dart.hpp" />
<ClInclude Include="..\src\boosting\goss.hpp" />
<ClInclude Include="..\src\boosting\rf.hpp" />
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
......
......@@ -192,6 +192,9 @@
<ClInclude Include="..\include\LightGBM\R_object_helper.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\src\boosting\rf.hpp">
<Filter>src\boosting</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment