Commit 2a8d38c5 authored by Qiwei Ye

Merge branches 'master' and 'master' of https://github.com/Microsoft/LightGBM

parents 351b3d7e ed958eb2
......@@ -15,25 +15,34 @@ namespace LightGBM {
template<typename PointWiseLossCalculator>
class RegressionMetric: public Metric {
public:
explicit RegressionMetric(const MetricConfig& config) {
early_stopping_round_ = config.early_stopping_round;
output_freq_ = config.output_freq;
the_bigger_the_better = false;
explicit RegressionMetric(const MetricConfig&) {
}
virtual ~RegressionMetric() {
}
const char* GetName() const override {
return name_.c_str();
}
bool is_bigger_better() const override {
return false;
}
void Init(const char* test_name, const Metadata& metadata, data_size_t num_data) override {
name = test_name;
std::stringstream str_buf;
str_buf << test_name << "'s " << PointWiseLossCalculator::Name();
name_ = str_buf.str();
num_data_ = num_data;
// get label
label_ = metadata.label();
// get weights
weights_ = metadata.weights();
if (weights_ == nullptr) {
sum_weights_ = static_cast<double>(num_data_);
sum_weights_ = static_cast<float>(num_data_);
} else {
sum_weights_ = 0.0f;
for (data_size_t i = 0; i < num_data_; ++i) {
......@@ -42,29 +51,24 @@ public:
}
}
score_t PrintAndGetLoss(int iter, const score_t* score) const override {
if (early_stopping_round_ > 0 || (output_freq_ > 0 && iter % output_freq_ == 0)) {
score_t sum_loss = 0.0;
std::vector<float> Eval(const score_t* score) const override {
score_t sum_loss = 0.0f;
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i];
}
}
score_t loss = PointWiseLossCalculator::AverageLoss(sum_loss, sum_weights_);
if (output_freq_ > 0 && iter % output_freq_ == 0){
Log::Info("Iteration:%d, %s's %s : %f", iter, name, PointWiseLossCalculator::Name(), loss);
}
return loss;
}
return 0.0f;
return std::vector<float>(1, static_cast<float>(loss));
}
inline static score_t AverageLoss(score_t sum_loss, score_t sum_weights) {
......@@ -72,8 +76,6 @@ public:
}
private:
/*! \brief Output frequently */
int output_freq_;
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
......@@ -81,9 +83,9 @@ private:
/*! \brief Pointer of weights */
const float* weights_;
/*! \brief Sum weights */
double sum_weights_;
float sum_weights_;
/*! \brief Name of this test set */
const char* name;
std::string name_;
};
/*! \brief L2 loss for regression task */
......
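The reworked metric above replaces PrintAndGetLoss with a GetName/is_bigger_better/Eval interface and delegates the loss math to the PointWiseLossCalculator template parameter. A minimal sketch of such a calculator, with member names taken from the calls visible in this hunk (LossOnPoint, AverageLoss, Name); the repo's actual L2 class may differ in detail:

using score_t = float;  // stand-in for LightGBM's score type

class L2LossCalculator {
public:
  // squared error on a single data point
  inline static score_t LossOnPoint(float label, score_t score) {
    return (score - label) * (score - label);
  }
  // weighted mean over the accumulated pointwise losses
  inline static score_t AverageLoss(score_t sum_loss, score_t sum_weights) {
    return sum_loss / sum_weights;
  }
  inline static const char* Name() { return "l2 loss"; }
};
// usage: RegressionMetric<L2LossCalculator> metric(config);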
......@@ -77,7 +77,7 @@ Linkers::~Linkers() {
}
void Linkers::ParseMachineList(const char * filename) {
TextReader<size_t> machine_list_reader(filename);
TextReader<size_t> machine_list_reader(filename, false);
machine_list_reader.ReadAllLines();
if (machine_list_reader.Lines().size() <= 0) {
Log::Fatal("Machine list file:%s doesn't exist", filename);
......
......@@ -9,7 +9,7 @@
namespace LightGBM {
// static member defination
// static member definition
int Network::num_machines_;
int Network::rank_;
Linkers* Network::linkers_;
......@@ -141,7 +141,7 @@ void Network::ReduceScatter(char* input, int input_size, int* block_start, int*
// send local data to neighbor first
linkers_->Send(recursive_halving_map_.neighbor, input, input_size);
} else if (recursive_halving_map_.type == RecursiveHalvingNodeType::GroupLeader) {
// recieve neighbor data first
// receive neighbor data first
int need_recv_cnt = input_size;
linkers_->Recv(recursive_halving_map_.neighbor, output, need_recv_cnt);
// reduce
......
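The Send/Recv branches above implement a recursive-halving reduce-scatter. A standalone simulation of the core idea for a power-of-two machine count (the repo's socket-based version, with its neighbor and GroupLeader handling for other machine counts, is more involved):

#include <cstdio>
#include <vector>

int main() {
  const int num_machines = 4;  // power of two assumed in this sketch
  const int n = 4;             // one block per machine
  // buf[i][j]: machine i's local value for block j; machine i holds i + 1 everywhere
  std::vector<std::vector<double>> buf(num_machines, std::vector<double>(n));
  for (int i = 0; i < num_machines; ++i)
    for (int j = 0; j < n; ++j) buf[i][j] = i + 1;
  // active block range [lo, hi) still owned by each machine
  std::vector<int> lo(num_machines, 0), hi(num_machines, n);
  for (int step = num_machines / 2; step >= 1; step /= 2) {
    for (int i = 0; i < num_machines; ++i) {
      int partner = i ^ step;
      if (partner < i) continue;      // handle each pair once
      int mid = (lo[i] + hi[i]) / 2;  // partners always share the same range
      // lower rank keeps [lo, mid), higher rank keeps [mid, hi); each reduces its half
      for (int j = lo[i]; j < mid; ++j) buf[i][j] += buf[partner][j];
      for (int j = mid; j < hi[i]; ++j) buf[partner][j] += buf[i][j];
      hi[i] = mid;
      lo[partner] = mid;
    }
  }
  for (int i = 0; i < num_machines; ++i)  // each machine ends up owning one fully reduced block
    printf("machine %d owns block %d = %g\n", i, lo[i], buf[i][lo[i]]);
  return 0;  // every owned block prints 10 = 1 + 2 + 3 + 4
}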
......@@ -155,7 +155,7 @@ public:
pAdapter = pAdapter->Next;
}
} else {
Log::Error("GetAdaptersinfo error: code %d ", dwRetVal);
Log::Fatal("GetAdaptersinfo error: code %d ", dwRetVal);
}
if (pAdapterInfo)
FREE(pAdapterInfo);
......
......@@ -8,13 +8,13 @@
namespace LightGBM {
/*!
* \brief Objective funtion for binary classification
* \brief Objective function for binary classification
*/
class BinaryLogloss: public ObjectiveFunction {
public:
explicit BinaryLogloss(const ObjectiveConfig& config) {
is_unbalance_ = config.is_unbalance;
sigmoid_ = static_cast<score_t>(config.sigmoid);
sigmoid_ = static_cast<float>(config.sigmoid);
if (sigmoid_ <= 0.0) {
Log::Fatal("Sigmoid parameter %f :should greater than zero", sigmoid_);
}
......@@ -47,8 +47,8 @@ public:
label_weights_[1] = 1.0f;
// if using unbalance, change the labels weight
if (is_unbalance_) {
label_weights_[1] = 1.0f / cnt_positive;
label_weights_[0] = 1.0f / cnt_negative;
label_weights_[1] = 1.0f;
label_weights_[0] = static_cast<float>(cnt_positive) / cnt_negative;
}
}
......@@ -80,7 +80,7 @@ public:
}
}
double GetSigmoid() const override {
float GetSigmoid() const override {
return sigmoid_;
}
......@@ -92,11 +92,11 @@ private:
/*! \brief True if using unbalance training */
bool is_unbalance_;
/*! \brief Sigmoid parameter */
score_t sigmoid_;
float sigmoid_;
/*! \brief Values for positive and negative labels */
int label_val_[2];
/*! \brief Weights for positive and negative labels */
score_t label_weights_[2];
float label_weights_[2];
/*! \brief Weights for data */
const float* weights_;
};
......
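The is_unbalance_ change above rescales rather than re-balances: with illustrative counts (not from the diff), the positive/negative weight ratio is identical under both schemes, but the new one keeps positive-class gradients at unit magnitude.

#include <cstdio>

int main() {
  const int cnt_positive = 100, cnt_negative = 900;
  // old scheme: inverse class counts
  float old_w1 = 1.0f / cnt_positive, old_w0 = 1.0f / cnt_negative;
  // new scheme: unit weight for positives, count ratio for negatives
  float new_w1 = 1.0f, new_w0 = static_cast<float>(cnt_positive) / cnt_negative;
  // both ratios print 9: the relative class weighting is unchanged
  printf("old w1/w0 = %g, new w1/w0 = %g\n", old_w1 / old_w0, new_w1 / new_w0);
  return 0;
}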
#ifndef LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_
#include <LightGBM/objective_function.h>
#include <cstring>
#include <cmath>
namespace LightGBM {
/*!
* \brief Objective function for multiclass classification
*/
class MulticlassLogloss: public ObjectiveFunction {
public:
explicit MulticlassLogloss(const ObjectiveConfig& config)
:label_int_(nullptr) {
num_class_ = config.num_class;
}
~MulticlassLogloss() {
if (label_int_ != nullptr) { delete[] label_int_; }
}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
label_int_ = new int[num_data_];
for (int i = 0; i < num_data_; ++i){
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but find %d in label", num_class_, label_int_[i]);
}
}
}
void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<float> rec(num_class_);
for (int k = 0; k < num_class_; ++k){
rec[k] = static_cast<float>(score[k * num_data_ + i]);
}
Common::Softmax(&rec);
for (int k = 0; k < num_class_; ++k) {
score_t p = static_cast<score_t>(rec[k]);
if (label_int_[i] == k) {
gradients[k * num_data_ + i] = p - 1.0f;
} else {
gradients[k * num_data_ + i] = p;
}
hessians[k * num_data_ + i] = 2.0f * p * (1.0f - p);
}
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<float> rec(num_class_);
for (int k = 0; k < num_class_; ++k){
rec[k] = static_cast<float>(score[k * num_data_ + i]);
}
Common::Softmax(&rec);
for (int k = 0; k < num_class_; ++k) {
score_t p = static_cast<score_t>(rec[k]);
if (label_int_[i] == k) {
gradients[k * num_data_ + i] = (p - 1.0f) * weights_[i];
} else {
gradients[k * num_data_ + i] = p * weights_[i];
}
hessians[k * num_data_ + i] = 2.0f * p * (1.0f - p) * weights_[i];
}
}
}
}
float GetSigmoid() const override {
return -1.0f;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Number of classes */
int num_class_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Corresponding integers of label_ */
int* label_int_;
/*! \brief Weights for data */
const float* weights_;
};
} // namespace LightGBM
#endif // LIGHTGBM_OBJECTIVE_MULTICLASS_OBJECTIVE_HPP_
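GetGradients above assumes Common::Softmax normalizes the score vector in place; a minimal numerically stable sketch consistent with that usage (the repo's own implementation may differ). Note also that the 2.0f * p * (1.0f - p) hessian is twice the diagonal second derivative p(1 - p) of the multiclass logloss.

#include <algorithm>
#include <cmath>
#include <vector>

void Softmax(std::vector<float>* rec) {
  // subtract the max before exponentiating to avoid overflow
  float wmax = (*rec)[0];
  for (float v : *rec) wmax = std::max(wmax, v);
  float wsum = 0.0f;
  for (float& v : *rec) { v = std::exp(v - wmax); wsum += v; }
  for (float& v : *rec) v /= wsum;  // probabilities now sum to 1
}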
......@@ -2,16 +2,19 @@
#include "regression_objective.hpp"
#include "binary_objective.hpp"
#include "rank_objective.hpp"
#include "multiclass_objective.hpp"
namespace LightGBM {
ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const ObjectiveConfig& config) {
if (type == "regression") {
if (type == std::string("regression")) {
return new RegressionL2loss(config);
} else if (type == "binary") {
} else if (type == std::string("binary")) {
return new BinaryLogloss(config);
} else if (type == "lambdarank") {
} else if (type == std::string("lambdarank")) {
return new LambdarankNDCG(config);
} else if (type == std::string("multiclass")) {
return new MulticlassLogloss(config);
}
return nullptr;
}
......
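Since type is already a std::string, comparing against std::string("...") is equivalent to the previous literal comparisons; the substantive change is the new "multiclass" branch. A hedged usage sketch (MakeObjective is a hypothetical wrapper, and only the headers shown in this diff are assumed):

#include <LightGBM/objective_function.h>
#include <string>

namespace LightGBM {

// hypothetical helper: surface the factory's nullptr fallback as a hard error
ObjectiveFunction* MakeObjective(const std::string& type, const ObjectiveConfig& config) {
  ObjectiveFunction* objective = ObjectiveFunction::CreateObjectiveFunction(type, config);
  if (objective == nullptr) {
    Log::Fatal("Unknown objective type %s", type.c_str());
  }
  return objective;
}

}  // namespace LightGBM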
......@@ -14,16 +14,16 @@
namespace LightGBM {
/*!
* \brief Objective funtion for Lambdrank with NDCG
* \brief Objective function for Lambdarank with NDCG
*/
class LambdarankNDCG: public ObjectiveFunction {
public:
explicit LambdarankNDCG(const ObjectiveConfig& config) {
sigmoid_ = static_cast<score_t>(config.sigmoid);
sigmoid_ = static_cast<float>(config.sigmoid);
// initialize DCG calculator
DCGCalculator::Init(config.label_gain);
// copy label gain to local
std::vector<double> label_gain = config.label_gain;
std::vector<float> label_gain = config.label_gain;
for (auto gain : label_gain) {
label_gain_.push_back(static_cast<score_t>(gain));
}
......@@ -47,10 +47,10 @@ public:
// get boundaries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Fatal("For NDCG metric, should have query information");
Log::Fatal("For lambdarank tasks, should have query information");
}
num_queries_ = metadata.num_queries();
// cache inverse max DCG, avoid compution many times
// cache inverse max DCG, avoid computation many times
inverse_max_dcgs_ = new score_t[num_queries_];
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = static_cast<score_t>(
......@@ -194,7 +194,7 @@ public:
}
}
double GetSigmoid() const override {
float GetSigmoid() const override {
// though we use a sigmoid transform in the objective,
// we don't need to apply it for prediction,
// since we only need the ranking score.
......@@ -207,7 +207,7 @@ private:
/*! \brief Cache inverse max DCG, speed up calculation */
score_t* inverse_max_dcgs_;
/*! \brief Sigmoid param */
score_t sigmoid_;
float sigmoid_;
/*! \brief Optimized NDCG@ */
int optimize_pos_at_;
/*! \brief Number of queries */
......
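Caching inverse max DCG per query turns each later NDCG evaluation into a single multiply. A standalone sketch using the standard 2^label - 1 gains and 1/log2(pos + 2) discounts (the repo reads its gains from config.label_gain instead):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

double DCG(const std::vector<int>& labels_in_ranked_order) {
  double dcg = 0.0;
  for (size_t pos = 0; pos < labels_in_ranked_order.size(); ++pos)
    dcg += (std::pow(2.0, labels_in_ranked_order[pos]) - 1.0) / std::log2(pos + 2.0);
  return dcg;
}

int main() {
  std::vector<int> ranked = {1, 2, 0};  // relevance labels in current ranking order
  std::vector<int> ideal = ranked;
  std::sort(ideal.begin(), ideal.end(), std::greater<int>());  // best possible order
  double inverse_max_dcg = 1.0 / DCG(ideal);  // computed once, cached per query
  printf("NDCG = %f\n", DCG(ranked) * inverse_max_dcg);
  return 0;
}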
......@@ -5,7 +5,7 @@
namespace LightGBM {
/*!
* \brief Objective funtion for regression
* \brief Objective function for regression
*/
class RegressionL2loss: public ObjectiveFunction {
public:
......@@ -38,9 +38,9 @@ public:
}
}
double GetSigmoid() const override {
float GetSigmoid() const override {
// not sigmoid transform, return -1
return -1.0;
return -1.0f;
}
private:
......
......@@ -103,19 +103,19 @@ void DataParallelTreeLearner::BeforeTrain() {
}
// sync global data sumup info
std::tuple<data_size_t, score_t, score_t> data(smaller_leaf_splits_->num_data_in_leaf(),
std::tuple<data_size_t, double, double> data(smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
int size = sizeof(data);
std::memcpy(input_buffer_, &data, size);
// global sumup reduce
Network::Allreduce(input_buffer_, size, size, output_buffer_, [](const char *src, char *dst, int len) {
int used_size = 0;
int type_size = sizeof(std::tuple<data_size_t, score_t, score_t>);
const std::tuple<data_size_t, score_t, score_t> *p1;
std::tuple<data_size_t, score_t, score_t> *p2;
int type_size = sizeof(std::tuple<data_size_t, double, double>);
const std::tuple<data_size_t, double, double> *p1;
std::tuple<data_size_t, double, double> *p2;
while (used_size < len) {
p1 = reinterpret_cast<const std::tuple<data_size_t, score_t, score_t> *>(src);
p2 = reinterpret_cast<std::tuple<data_size_t, score_t, score_t> *>(dst);
p1 = reinterpret_cast<const std::tuple<data_size_t, double, double> *>(src);
p2 = reinterpret_cast<std::tuple<data_size_t, double, double> *>(dst);
std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
......
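The score_t-to-double switch here matters because these sums are combined across machines, where float accumulation drifts. The reducer handed to Network::Allreduce walks raw byte buffers in sizeof(tuple) strides; a standalone sketch of that pattern (it assumes, as the code above does, that every machine uses the same tuple layout):

#include <cstdio>
#include <tuple>

using data_size_t = int;  // stand-in for LightGBM's typedef

void SumTuples(const char* src, char* dst, int len) {
  using T = std::tuple<data_size_t, double, double>;
  int used_size = 0;
  while (used_size < len) {
    const T* p1 = reinterpret_cast<const T*>(src);
    T* p2 = reinterpret_cast<T*>(dst);
    std::get<0>(*p2) += std::get<0>(*p1);  // data counts add up
    std::get<1>(*p2) += std::get<1>(*p1);  // so do gradient sums
    std::get<2>(*p2) += std::get<2>(*p1);  // and hessian sums
    src += sizeof(T);
    dst += sizeof(T);
    used_size += static_cast<int>(sizeof(T));
  }
}

int main() {
  std::tuple<data_size_t, double, double> a(10, 1.5, 2.5), b(20, 3.0, 4.0);
  SumTuples(reinterpret_cast<const char*>(&a), reinterpret_cast<char*>(&b), sizeof(a));
  printf("%d %f %f\n", std::get<0>(b), std::get<1>(b), std::get<2>(b));  // 30 4.5 6.5
  return 0;
}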
......@@ -26,7 +26,7 @@ public:
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void Init(const Feature* feature, int feature_idx, data_size_t min_num_data_one_leaf,
score_t min_sum_hessian_one_leaf) {
double min_sum_hessian_one_leaf) {
feature_idx_ = feature_idx;
min_num_data_one_leaf_ = min_num_data_one_leaf;
min_sum_hessian_one_leaf_ = min_sum_hessian_one_leaf;
......@@ -40,13 +40,13 @@ public:
* \brief Construct a histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hissians of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param ordered_gradients Ordered gradients
* \param ordered_hessians Ordered hessians
* \param data_indices data indices of current leaf
*/
void Construct(data_size_t* data_indices, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
void Construct(data_size_t* data_indices, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
......@@ -59,12 +59,12 @@ public:
* \param leaf current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hissians of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param gradients
* \param hessians
*/
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* gradients, const score_t* hessians) {
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* gradients, const score_t* hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
......@@ -76,9 +76,9 @@ public:
* \brief Set sumup information for current histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hissians of current leaf
* \param sum_hessians sum of hessians of current leaf
*/
void SetSumup(data_size_t num_data, score_t sum_gradients, score_t sum_hessians) {
void SetSumup(data_size_t num_data, double sum_gradients, double sum_hessians) {
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
......@@ -104,15 +104,15 @@ public:
* \param output The best split result
*/
void FindBestThreshold(SplitInfo* output) {
score_t best_sum_left_gradient = NAN;
score_t best_sum_left_hessian = NAN;
score_t best_gain = kMinScore;
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
score_t sum_right_gradient = 0.0f;
score_t sum_right_hessian = kEpsilon;
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
score_t gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
is_splittable_ = false;
// from right to left, and we don't need data in bin0
for (unsigned int t = num_bins_ - 1; t > 0; --t) {
......@@ -125,14 +125,14 @@ public:
// if data not enough
if (left_count < min_num_data_one_leaf_) break;
score_t sum_left_hessian = sum_hessians_ - sum_right_hessian;
double sum_left_hessian = sum_hessians_ - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < min_sum_hessian_one_leaf_) {
break;
}
score_t sum_left_gradient = sum_gradients_ - sum_right_gradient;
double sum_left_gradient = sum_gradients_ - sum_right_gradient;
// current split gain
score_t current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian) + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian) + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain is worse than not performing the split
if (current_gain < gain_shift) {
continue;
......@@ -195,7 +195,7 @@ public:
/*!
* \brief Set min sum hessian in one leaf
*/
void SetMinSumHessianOneLeaf(score_t new_val) {
void SetMinSumHessianOneLeaf(double new_val) {
min_sum_hessian_one_leaf_ = new_val;
}
......@@ -216,7 +216,7 @@ private:
* \param sum_hessians
* \return split gain
*/
score_t GetLeafSplitGain(score_t sum_gradients, score_t sum_hessians) const {
double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
return (sum_gradients * sum_gradients) / (sum_hessians);
}
......@@ -226,7 +226,7 @@ private:
* \param sum_hessians
* \return leaf output
*/
score_t CalculateSplittedLeafOutput(score_t sum_gradients, score_t sum_hessians) const {
double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
return -(sum_gradients) / (sum_hessians);
}
......@@ -234,7 +234,7 @@ private:
/*! \brief minimal number of data in one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum hessian of data in one leaf */
score_t min_sum_hessian_one_leaf_;
double min_sum_hessian_one_leaf_;
/*! \brief the bin data of current feature */
const Bin* bin_data_;
/*! \brief number of bin of histogram */
......@@ -244,9 +244,9 @@ private:
/*! \brief number of all data */
data_size_t num_data_;
/*! \brief sum of gradient of current leaf */
score_t sum_gradients_;
double sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
double sum_hessians_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
};
......
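FindBestThreshold above accepts a split only when the children's combined gain G_L^2/H_L + G_R^2/H_R beats the parent's G^2/H (gain_shift); GetLeafSplitGain and CalculateSplittedLeafOutput are the standard Newton-step forms without regularization. A standalone arithmetic check with toy numbers:

#include <cstdio>

double LeafSplitGain(double g, double h) { return g * g / h; }

int main() {
  double sum_gradients = -6.0, sum_hessians = 10.0;  // parent leaf totals
  double left_g = -5.0, left_h = 4.0;                // candidate left child
  double right_g = sum_gradients - left_g;           // -1.0
  double right_h = sum_hessians - left_h;            // 6.0
  double gain_shift = LeafSplitGain(sum_gradients, sum_hessians);  // 3.6
  double split_gain = LeafSplitGain(left_g, left_h) + LeafSplitGain(right_g, right_h);
  printf("gain_shift = %g, split gain = %g\n", gain_shift, split_gain);  // 3.6 vs ~6.42: split accepted
  // leaf outputs after the split, per CalculateSplittedLeafOutput: -G/H
  printf("left output = %g, right output = %g\n", -left_g / left_h, -right_g / right_h);
  return 0;
}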
......@@ -26,13 +26,14 @@ public:
}
/*!
* \brief Init split on current leaf on partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param sum_gradients
* \param sum_hessians
*/
void Init(int leaf, const DataPartition* data_partition, score_t sum_gradients, score_t sum_hessians) {
void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
sum_gradients_ = sum_gradients;
......@@ -43,7 +44,7 @@ public:
}
/*!
* \brief Init splits on current leaf, it will travese all data to sum up the results
* \brief Init splits on current leaf, it will traverse all data to sum up the results
* \param gradients
* \param hessians
*/
......@@ -51,8 +52,8 @@ public:
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
......@@ -75,8 +76,8 @@ public:
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t *hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
data_size_t idx = data_indices_[i];
......@@ -96,7 +97,7 @@ public:
* \param sum_gradients
* \param sum_hessians
*/
void Init(score_t sum_gradients, score_t sum_hessians) {
void Init(double sum_gradients, double sum_hessians) {
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
......@@ -125,10 +126,10 @@ public:
data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }
/*! \brief Get sum of gradients of current leaf */
score_t sum_gradients() const { return sum_gradients_; }
double sum_gradients() const { return sum_gradients_; }
/*! \brief Get sum of hessians of current leaf */
score_t sum_hessians() const { return sum_hessians_; }
double sum_hessians() const { return sum_hessians_; }
/*! \brief Get indices of data of current leaf */
data_size_t * data_indices() const { return data_indices_; }
......@@ -146,9 +147,9 @@ private:
/*! \brief number of features */
int num_features_;
/*! \brief sum of gradients of current leaf */
score_t sum_gradients_;
double sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
double sum_hessians_;
/*! \brief indices of data of current leaf */
data_size_t* data_indices_;
};
......
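The accumulator change in LeafSplits follows the same precision reasoning: per-element gradients stay float (score_t), but the OpenMP reduction variables are double so that summing millions of terms does not drift. A standalone sketch of the pattern (compile with -fopenmp):

#include <cstdio>
#include <vector>

int main() {
  const int num_data = 1000000;
  std::vector<float> gradients(num_data, 0.1f), hessians(num_data, 1.0f);
  double tmp_sum_gradients = 0.0, tmp_sum_hessians = 0.0;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
  for (int i = 0; i < num_data; ++i) {
    tmp_sum_gradients += gradients[i];  // float widened to double per addition
    tmp_sum_hessians += hessians[i];
  }
  printf("sum_g = %f, sum_h = %f\n", tmp_sum_gradients, tmp_sum_hessians);
  return 0;
}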
......@@ -77,9 +77,9 @@ private:
int* block_start_;
/*! \brief Block size for reduce scatter */
int* block_len_;
/*! \brief Write positions for feature histgrams */
/*! \brief Write positions for feature histograms */
int* buffer_write_start_pos_;
/*! \brief Read positions for local feature histgrams */
/*! \brief Read positions for local feature histograms */
int* buffer_read_start_pos_;
/*! \brief Size for reduce scatter */
int reduce_scatter_size_;
......
......@@ -15,10 +15,11 @@ SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
// initialize with nullptr
num_leaves_ = tree_config.num_leaves;
min_num_data_one_leaf_ = static_cast<data_size_t>(tree_config.min_data_in_leaf);
min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
min_sum_hessian_one_leaf_ = static_cast<double>(tree_config.min_sum_hessian_in_leaf);
feature_fraction_ = tree_config.feature_fraction;
random_ = Random(tree_config.feature_fraction_seed);
histogram_pool_size_ = tree_config.histogram_pool_size;
max_depth_ = tree_config.max_depth;
}
SerialTreeLearner::~SerialTreeLearner() {
......@@ -62,16 +63,17 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
max_cache_size = Common::Min(max_cache_size, num_leaves_);
histogram_pool_.ResetSize(max_cache_size, num_leaves_);
for (int i = 0; i < max_cache_size; ++i) {
auto histogram_create_function = [this]() {
FeatureHistogram* tmp_histogram_array = new FeatureHistogram[train_data_->num_features()];
for (int j = 0; j < train_data_->num_features(); ++j) {
tmp_histogram_array[j].Init(train_data_->FeatureAt(j),
j, min_num_data_one_leaf_,
min_sum_hessian_one_leaf_);
}
// set data at i-th position
histogram_pool_.Set(i, tmp_histogram_array);
}
return tmp_histogram_array;
};
histogram_pool_.Fill(histogram_create_function);
// push split information for all leaves
for (int i = 0; i < num_leaves_; ++i) {
best_split_per_leaf_.push_back(SplitInfo());
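The histogram setup now hands the pool a factory lambda through Fill instead of building each entry and calling Set(i, ...) in the loop; see the sketch below. LRUPool's real interface is only inferred here from the Fill/Get/Move/Set calls visible in this diff.

#include <functional>
#include <vector>

// hedged sketch of a pool that materializes its own slots from a factory
template <typename T>
class SimplePool {
public:
  explicit SimplePool(int size) : slots_(size) {}
  // fill every slot by invoking the factory once per slot
  void Fill(const std::function<T()>& create) {
    for (auto& slot : slots_) slot = create();
  }
private:
  std::vector<T> slots_;
};
// usage mirroring the diff:
//   SimplePool<FeatureHistogram*> pool(max_cache_size);
//   pool.Fill(histogram_create_function);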
......@@ -106,7 +108,7 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
// initialize ordered gradients and hessians
ordered_gradients_ = new score_t[num_data_];
ordered_hessians_ = new score_t[num_data_];
// if has ordered bin, need allocata a buffer to fast split
// if has ordered bin, need allocate a buffer to fast split
if (has_ordered_bin_) {
is_data_in_leaf_ = new char[num_data_];
}
......@@ -120,6 +122,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
// some initial work before training
BeforeTrain();
Tree *tree = new Tree(num_leaves_);
// save pointer to last trained tree
last_trained_tree_ = tree;
// root leaf
int left_leaf = 0;
// only the root leaf can be split the first time
......@@ -145,8 +149,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
// split tree with best leaf
Split(tree, best_leaf, &left_leaf, &right_leaf);
}
// save pointer to last trained tree
last_trained_tree_ = tree;
return tree;
}
......@@ -234,6 +236,17 @@ void SerialTreeLearner::BeforeTrain() {
}
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
// check depth of current leaf
if (max_depth_ > 0) {
// only need to check the left leaf, since the right leaf is at the same depth
if (last_trained_tree_->leaf_depth(left_leaf) >= max_depth_) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
}
return false;
}
}
data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
......@@ -257,14 +270,14 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
} else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
larger_leaf = right_leaf;
// put parent(left) leaf's histograms into larger leaf's histgrams
// put parent(left) leaf's histograms into larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else {
smaller_leaf = right_leaf;
larger_leaf = left_leaf;
// put parent(left) leaf's histograms to larger leaf's histgrams
// put parent(left) leaf's histograms to larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
......@@ -402,7 +415,9 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
*right_leaf = tree->Split(best_Leaf, best_split_info.feature, best_split_info.threshold,
train_data_->FeatureAt(best_split_info.feature)->feature_index(),
train_data_->FeatureAt(best_split_info.feature)->BinToValue(best_split_info.threshold),
best_split_info.left_output, best_split_info.right_output, best_split_info.gain);
static_cast<float>(best_split_info.left_output),
static_cast<float>(best_split_info.right_output),
static_cast<float>(best_split_info.gain));
// split data partition
data_partition_->Split(best_Leaf, train_data_->FeatureAt(best_split_info.feature)->bin_data(),
......
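The new max_depth_ check reads last_trained_tree_->leaf_depth during training, which is presumably why Train now saves the tree pointer before the split loop rather than after it. A hedged sketch of the leaf-depth bookkeeping such a check relies on (Tree's actual internals are not shown in this diff):

#include <vector>

class DepthTracker {
public:
  explicit DepthTracker(int num_leaves) : leaf_depth_(num_leaves, 0) {}
  // splitting `parent` creates `child`; both children sit one level deeper
  void OnSplit(int parent, int child) {
    leaf_depth_[child] = leaf_depth_[parent] + 1;
    leaf_depth_[parent] += 1;
  }
  int leaf_depth(int leaf) const { return leaf_depth_[leaf]; }
private:
  std::vector<int> leaf_depth_;
};
// With max_depth_ > 0 the learner only tests the left leaf: the right leaf
// came from the same split, so it is always at the same depth.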
......@@ -41,11 +41,11 @@ public:
void AddPredictionToScore(score_t *out_score) const override {
#pragma omp parallel for schedule(guided)
for (int i = 0; i < data_partition_->num_leaves(); ++i) {
double output = last_trained_tree_->LeafOutput(i);
float output = last_trained_tree_->LeafOutput(i);
data_size_t* tmp_idx = nullptr;
data_size_t cnt_leaf_data = data_partition_->GetIndexOnLeaf(i, &tmp_idx);
for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
out_score[tmp_idx[j]] += static_cast<score_t>(output);
out_score[tmp_idx[j]] += output;
}
}
}
......@@ -114,14 +114,14 @@ protected:
/*! \brief minimal data on one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum hessian on one leaf */
score_t min_sum_hessian_one_leaf_;
double min_sum_hessian_one_leaf_;
/*! \brief sub-feature fraction rate */
double feature_fraction_;
float feature_fraction_;
/*! \brief training data partition on leaves */
DataPartition* data_partition_;
/*! \brief used for generate used features */
Random random_;
/*! \brief used for sub feature training, is_feature_used_[i] = falase means don't used feature i */
/*! \brief used for sub feature training, is_feature_used_[i] = false means don't use feature i */
bool* is_feature_used_;
/*! \brief pointer to histograms array of parent of current leaves */
FeatureHistogram* parent_leaf_histogram_array_;
......@@ -160,9 +160,11 @@ protected:
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
char* is_data_in_leaf_;
/*! \brief max cache size (unit: GB) for historical histograms; < 0 means no limit */
double histogram_pool_size_;
float histogram_pool_size_;
/*! \brief used to cache historical histograms to speed up */
LRUPool<FeatureHistogram*> histogram_pool_;
/*! \brief max depth of tree model */
int max_depth_;
};
......
......@@ -21,23 +21,23 @@ public:
/*! \brief Split threshold */
unsigned int threshold;
/*! \brief Left output after split */
score_t left_output;
double left_output;
/*! \brief Right output after split */
score_t right_output;
double right_output;
/*! \brief Split gain */
score_t gain;
double gain;
/*! \brief Left number of data after split */
data_size_t left_count;
/*! \brief Right number of data after split */
data_size_t right_count;
/*! \brief Left sum gradient after split */
score_t left_sum_gradient;
double left_sum_gradient;
/*! \brief Left sum hessian after split */
score_t left_sum_hessian;
double left_sum_hessian;
/*! \brief Right sum gradient after split */
score_t right_sum_gradient;
double right_sum_gradient;
/*! \brief Right sum hessian after split */
score_t right_sum_hessian;
double right_sum_hessian;
SplitInfo() {
// initialize with -1 and -inf gain
......@@ -75,8 +75,8 @@ public:
inline bool SplitInfo::operator > (const SplitInfo& si) const {
score_t local_gain = this->gain;
score_t other_gain = si.gain;
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf
if (local_gain == NAN) {
local_gain = kMinScore;
......
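One caveat in the unchanged comparison code above: local_gain == NAN can never be true, because NaN compares unequal to everything, including NAN itself, so the kMinScore replacement never fires. A sketch of the intended guard using std::isnan (SanitizeGain is a hypothetical helper):

#include <cmath>

inline double SanitizeGain(double gain, double min_score /* kMinScore */) {
  return std::isnan(gain) ? min_score : gain;  // map NaN gains to the -inf sentinel
}
// local_gain = SanitizeGain(this->gain, kMinScore);
// other_gain = SanitizeGain(si.gain, kMinScore);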
......@@ -185,11 +185,13 @@
<ClInclude Include="..\src\metric\binary_metric.hpp" />
<ClInclude Include="..\src\metric\rank_metric.hpp" />
<ClInclude Include="..\src\metric\regression_metric.hpp" />
<ClInclude Include="..\src\metric\multiclass_metric.hpp" />
<ClInclude Include="..\src\network\linkers.h" />
<ClInclude Include="..\src\network\socket_wrapper.hpp" />
<ClInclude Include="..\src\objective\binary_objective.hpp" />
<ClInclude Include="..\src\objective\rank_objective.hpp" />
<ClInclude Include="..\src\objective\regression_objective.hpp" />
<ClInclude Include="..\src\objective\multiclass_objective.hpp" />
<ClInclude Include="..\src\treelearner\data_partition.hpp" />
<ClInclude Include="..\src\treelearner\feature_histogram.hpp" />
<ClInclude Include="..\src\treelearner\leaf_splits.hpp" />
......
......@@ -75,6 +75,9 @@
<ClInclude Include="..\src\metric\regression_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\multiclass_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\network\socket_wrapper.hpp">
<Filter>src\network</Filter>
</ClInclude>
......@@ -87,6 +90,9 @@
<ClInclude Include="..\src\objective\regression_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\multiclass_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\data_partition.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
......