Unverified Commit 5392c9ea authored by Guolin Ke, committed by GitHub

Fix objective functions with zero hessian (#1199)

parent d90369a0
......@@ -205,10 +205,24 @@ Supports the following metrics:
- NDCG
- MAP
- Multi class log loss
- Multi class error rate
- Fair
- Huber
- Poisson
- Quantile
- MAPE
- Kullback-Leibler
For more details, please refer to `Parameters <./Parameters.rst#metric-parameters>`__.
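For illustration, a minimal sketch of requesting the new MAPE objective and metric through the Python package (parameter names as in the tables above; the synthetic data and round count are placeholders):

import lightgbm as lgb
import numpy as np

# toy regression data; MAPE treats |label| < 1 as 1, so keep labels away from zero
X = np.random.rand(500, 5)
y = 1.0 + 10.0 * np.random.rand(500)

params = {"objective": "mape", "metric": ["mape", "l1"], "verbose": -1}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=20)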
Other Features
......@@ -269,7 +283,7 @@ References
.. _LightGBM\: A Highly Efficient Gradient Boosting Decision Tree: https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
.. _On Grouping for Maximum Homogeneity: http://amstat.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479
.. _On Grouping for Maximum Homogeneity: http://www.csiss.org/SPACE/workshops/2004/SAC/files/fisher.pdf
.. _Optimization of collective communication operations in MPICH: http://wwwi10.lrr.in.tum.de/~gerndt/home/Teaching/HPCSeminar/mpich_multi_coll.pdf
......
......@@ -54,7 +54,7 @@ Core Parameters
- **Note**: can only be used in the CLI version.
- ``application``, default=\ ``regression``, type=enum,
options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``quantile_l2``,
options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``,
``binary``, ``multiclass``, ``multiclassova``, ``xentropy``, ``xentlambda``, ``lambdarank``,
alias=\ ``objective``, ``app``
......@@ -72,7 +72,7 @@ Core Parameters
- ``quantile``, `Quantile regression`_
- ``quantile_l2``, like the ``quantile``, but L2 loss is used instead
- ``mape``, `MAPE loss`_
- ``binary``, binary `log loss`_ classification application
......@@ -513,10 +513,6 @@ Objective Parameters
- parameter for `Fair loss`_. Will be used in ``regression`` task
- ``gaussian_eta``, default=\ ``1.0``, type=double
- parameter to control the width of Gaussian function. Will be used in ``regression_l1`` and ``huber`` losses
- ``poisson_max_delta_step``, default=\ ``0.7``, type=double
- parameter for `Poisson regression`_ to safeguard optimization
......@@ -573,6 +569,8 @@ Metric Parameters
- ``l2_root``, root square loss, alias=\ ``root_mean_squared_error``, ``rmse``
- ``quantile``, `Quantile regression`_
- ``mape``, `MAPE loss`_
- ``huber``, `Huber loss`_
......@@ -744,6 +742,8 @@ You can specify query/group id in data file now. Please refer to parameter ``gr
.. _Quantile regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _MAPE loss: https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520
.. _Poisson regression: https://en.wikipedia.org/wiki/Poisson_regression
......
......@@ -68,7 +68,7 @@ Some important parameters:
- ``convert_model``, for converting model file into if-else format, see more information in `Convert model parameters <./Parameters.rst#convert-model-parameters>`__
- ``application``, default=\ ``regression``, type=enum,
options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``quantile_l2``,
options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``,
``binary``, ``multiclass``, ``multiclassova``, ``xentropy``, ``xentlambda``, ``lambdarank``,
alias=\ ``objective``, ``app``
......@@ -86,7 +86,7 @@ Some important parameters:
- ``quantile``, `Quantile regression`_
- ``quantile_l2``, like the ``quantile``, but L2 loss is used instead
- ``mape``, `MAPE loss`_
- ``binary``, binary `log loss`_ classification application
......@@ -234,6 +234,8 @@ Examples
.. _Quantile regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _MAPE loss: https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
.. _log loss: https://en.wikipedia.org/wiki/Cross_entropy
.. _softmax: https://en.wikipedia.org/wiki/Softmax_function
......
......@@ -164,8 +164,6 @@ public:
virtual ~ObjectiveConfig() {}
double sigmoid = 1.0f;
double fair_c = 1.0f;
// for Approximate Hessian With Gaussian
double gaussian_eta = 1.0f;
double poisson_max_delta_step = 0.7f;
// for lambdarank
std::vector<double> label_gain;
......@@ -473,7 +471,7 @@ struct ParameterAlias {
"convert_model", "convert_model_language",
"feature_fraction_seed", "enable_bundle", "data_filename", "valid_data_filenames",
"snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
"max_conflict_rate", "poisson_max_delta_step",
"histogram_pool_size", "is_provide_training_metric", "machine_list_filename", "machines",
"zero_as_missing", "init_score_file", "valid_init_score_file", "is_predict_contrib",
"max_cat_threshold", "cat_smooth", "min_data_per_group", "cat_l2", "max_cat_to_onehot",
......
......@@ -210,6 +210,52 @@ public:
return global;
}
template<class T>
static T GlobalSyncUpByMean(T& local) {
T global = (T)0;
Allreduce(reinterpret_cast<char*>(&local),
sizeof(local), sizeof(local),
reinterpret_cast<char*>(&global),
[](const char* src, char* dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const T *p1;
T *p2;
while (used_size < len) {
p1 = reinterpret_cast<const T *>(src);
p2 = reinterpret_cast<T *>(dst);
*p2 += *p1;
src += type_size;
dst += type_size;
used_size += type_size;
}
});
return static_cast<T>(global / num_machines_);
}
template<class T>
static void GlobalSum(std::vector<T>& local) {
std::vector<T> global(local.size(), (T)0);
Allreduce(reinterpret_cast<char*>(local.data()),
static_cast<comm_size_t>(sizeof(T) * local.size()), sizeof(T),
reinterpret_cast<char*>(global.data()),
[](const char* src, char* dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const T *p1;
T *p2;
while (used_size < len) {
p1 = reinterpret_cast<const T *>(src);
p2 = reinterpret_cast<T *>(dst);
*p2 += *p1;
src += type_size;
dst += type_size;
used_size += type_size;
}
});
for (size_t i = 0; i < local.size(); ++i) {
local[i] = global[i];
}
}
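As a minimal single-process sketch of what these helpers compute (the real versions run the element-wise sum through Allreduce across machines):

machines = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]        # per-machine local buffers
global_sum = [sum(vals) for vals in zip(*machines)]     # GlobalSum: element-wise sum
global_mean = [s / len(machines) for s in global_sum]   # GlobalSyncUpByMean: sum, then divide by num_machines_
print(global_sum, global_mean)                          # [9.0, 12.0] [3.0, 4.0]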
private:
static void AllgatherBruck(char* input, const comm_size_t* block_start, const comm_size_t* block_len, char* output, comm_size_t all_size);
......
......@@ -35,9 +35,14 @@ public:
virtual bool IsConstantHessian() const { return false; }
virtual bool BoostFromAverage() const { return false; }
virtual bool IsRenewTreeOutput() const { return false; }
virtual bool GetCustomAverage(double *) const { return false; }
virtual double RenewTreeOutput(double ori_output, const double*,
const data_size_t*,
const data_size_t*,
data_size_t) const { return ori_output; }
virtual double BoostFromScore() const { return 0.0f; }
virtual bool SkipEmptyClass() const { return false; }
......
......@@ -12,6 +12,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class Tree;
class Dataset;
class ObjectiveFunction;
/*!
* \brief Interface for tree learner
......@@ -67,6 +68,9 @@ public:
*/
virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0;
virtual void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, const double* prediction,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0;
TreeLearner() = default;
/*! \brief Disable copy */
TreeLearner& operator=(const TreeLearner&) = delete;
......
......@@ -640,27 +640,6 @@ inline static void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, s
}
/*
* approximate hessians of absolute loss with Gaussian function
* cf. https://en.wikipedia.org/wiki/Gaussian_function
*
* y is a prediction.
* t means true target.
* g means gradient.
* eta is a parameter to control the width of Gaussian function.
* w means weights.
*/
inline static double ApproximateHessianWithGaussian(const double y, const double t, const double g,
const double eta, const double w=1.0f) {
const double diff = y - t;
const double pi = 4.0 * std::atan(1.0);
const double x = std::fabs(diff);
const double a = 2.0 * std::fabs(g) * w; // difference of two first derivatives, (zero to inf) and (zero to -inf).
const double b = 0.0;
const double c = std::max((std::fabs(y) + std::fabs(t)) * eta, 1.0e-10);
return w * std::exp(-(x - b) * (x - b) / (2.0 * c * c)) * a / (c * std::sqrt(2 * pi));
}
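A small Python port of the removed helper shows why it was dropped: because the Gaussian width c grows with |y| + |t|, the approximated hessian scales roughly like 1 / (|y| + |t|), so for large-magnitude targets it is nearly zero and Newton-style leaf values -sum(grad)/sum(hess) can explode. The replacement in this commit uses constant (weight-scaled) hessians plus median-based leaf renewal instead.

import math

def approx_hessian_with_gaussian(y, t, g, eta, w=1.0):
    # port of the removed C++ helper, for illustration only
    diff = abs(y - t)
    a = 2.0 * abs(g) * w                       # jump between the two one-sided first derivatives
    c = max((abs(y) + abs(t)) * eta, 1e-10)    # Gaussian width grows with the target magnitude
    return w * math.exp(-diff * diff / (2.0 * c * c)) * a / (c * math.sqrt(2.0 * math.pi))

print(approx_hessian_with_gaussian(y=1e6, t=0.0, g=1.0, eta=1.0))   # ~ 4.8e-07, effectively a zero hessian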
template <typename T>
inline static std::vector<T*> Vector2Ptr(std::vector<std::vector<T>>& data) {
std::vector<T*> ptr(data.size());
......@@ -882,6 +861,11 @@ inline static const char* SkipNewLine(const char* str) {
return str;
}
template <typename T>
static int Sign(T x) {
return (x > T(0)) - (x < T(0));
}
} // namespace Common
} // namespace LightGBM
......
......@@ -1956,7 +1956,7 @@ class Booster(object):
self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
self.__higher_better_inner_eval = \
[name.startswith(('auc', 'ndcg', 'map')) for name in self.__name_inner_eval]
[name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]
def attr(self, key):
"""Get attribute string from the Booster.
......
......@@ -295,42 +295,15 @@ void GBDT::Bagging(int iter) {
* (i) and (ii) could be selected as say "auto_init_score" = 0 or 1 etc..
*
*/
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, const label_t* label, data_size_t num_data) {
double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj) {
double init_score = 0.0f;
bool got_custom = false;
if (fobj != nullptr) {
got_custom = fobj->GetCustomAverage(&init_score);
}
if (!got_custom) {
double sum_label = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_label)
for (data_size_t i = 0; i < num_data; ++i) {
sum_label += label[i];
}
init_score = sum_label / num_data;
init_score = fobj->BoostFromScore();
}
if (Network::num_machines() > 1) {
double global_init_score = 0.0f;
Network::Allreduce(reinterpret_cast<char*>(&init_score),
sizeof(init_score), sizeof(init_score),
reinterpret_cast<char*>(&global_init_score),
[](const char* src, char* dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const double *p1;
double *p2;
while (used_size < len) {
p1 = reinterpret_cast<const double *>(src);
p2 = reinterpret_cast<double *>(dst);
*p2 += *p1;
src += type_size;
dst += type_size;
used_size += type_size;
}
});
return global_init_score / Network::num_machines();
} else {
return init_score;
init_score = Network::GlobalSyncUpByMean(init_score);
}
return init_score;
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
......@@ -379,21 +352,23 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
double GBDT::BoostFromAverage() {
// boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty()
&& gbdt_config_->boost_from_average
&& !train_score_updater_->has_init_score()
if (models_.empty() && !train_score_updater_->has_init_score()
&& num_class_ <= 1
&& objective_function_ != nullptr
&& objective_function_->BoostFromAverage()) {
auto label = train_data_->metadata().label();
double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
if (std::fabs(init_score) > kEpsilon) {
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
&& objective_function_ != nullptr) {
if (gbdt_config_->boost_from_average) {
double init_score = ObtainAutomaticInitialScore(objective_function_);
if (std::fabs(init_score) > kEpsilon) {
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
}
Log::Info("Start training from score %lf", init_score);
return init_score;
}
return init_score;
} else if (std::string(objective_function_->GetName()) == std::string("regression_l1")
|| std::string(objective_function_->GetName()) == std::string("quantile")
|| std::string(objective_function_->GetName()) == std::string("mape")) {
Log::Warning("Disable boost_from_average in %s may cause the slow convergence.", objective_function_->GetName());
}
}
return 0.0f;
......@@ -434,10 +409,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
const size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[cur_tree_id]) {
size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
auto grad = gradients + bias;
auto hess = hessians + bias;
......@@ -460,6 +434,8 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (new_tree->num_leaves() > 1) {
should_continue = true;
tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, train_score_updater_->score() + bias,
num_data_, bag_data_indices_.data(), bag_data_cnt_);
// shrinkage by learning rate
new_tree->Shrinkage(shrinkage_rate_);
// update score
......
......@@ -206,11 +206,6 @@ public:
}
}
/*!
* \brief Get Type name of this boosting object
*/
const char* SubModelName() const override { return "tree"; }
private:
std::vector<data_size_t> tmp_indice_right_;
};
......
......@@ -73,7 +73,8 @@ public:
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, int cur_tree_id) {
tree->AddPredictionToScore(data_, num_data_, score_.data() + cur_tree_id * num_data_);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
}
/*!
* \brief Adding prediction score, only used for training data.
......@@ -83,7 +84,8 @@ public:
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
tree_learner->AddPredictionToScore(tree, score_.data() + cur_tree_id * num_data_);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree_learner->AddPredictionToScore(tree, score_.data() + offset);
}
/*!
* \brief Using tree model to get prediction number, then adding to scores for parts of data
......@@ -95,10 +97,12 @@ public:
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int cur_tree_id) {
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + cur_tree_id * num_data_);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
}
/*! \brief Pointer of score */
inline const double* score() const { return score_.data(); }
inline data_size_t num_data() const { return num_data_; }
/*! \brief Disable copy */
......
......@@ -310,8 +310,6 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
CHECK(sigmoid > 0);
GetDouble(params, "fair_c", &fair_c);
CHECK(fair_c > 0);
GetDouble(params, "gaussian_eta", &gaussian_eta);
CHECK(gaussian_eta > 0);
GetDouble(params, "poisson_max_delta_step", &poisson_max_delta_step);
CHECK(poisson_max_delta_step > 0);
GetInt(params, "max_position", &max_position);
......
......@@ -43,6 +43,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new CrossEntropyLambdaMetric(config);
} else if (type == std::string("kldiv") || type == std::string("kullback_leibler")) {
return new KullbackLeiblerDivergence(config);
} else if (type == std::string("mean_absolute_percentage_error") || type == std::string("mape")) {
return new MAPEMetric(config);
}
return nullptr;
}
......
......@@ -57,9 +57,11 @@ public:
sum_query_weights_ += query_weights_[i];
}
}
inverse_max_dcgs_.resize(num_queries_);
// cache the inverse max DCG for all queries, used to calculate NDCG
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_.emplace_back(eval_at_.size(), 0.0f);
inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f);
DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i],
query_boundaries_[i + 1] - query_boundaries_[i],
&inverse_max_dcgs_[i]);
......
......@@ -227,5 +227,20 @@ public:
}
};
/*! \brief Mape regression loss for regression task */
class MAPEMetric : public RegressionMetric<MAPEMetric> {
public:
explicit MAPEMetric(const MetricConfig& config) :RegressionMetric<MAPEMetric>(config) {
}
inline static double LossOnPoint(label_t label, double score, const MetricConfig&) {
return std::fabs((label - score)) / std::max(1.0f, std::fabs(label));
}
inline static const char* Name() {
return "mape";
}
};
} // namespace LightGBM
#endif // LightGBM_METRIC_REGRESSION_METRIC_HPP_
......@@ -15,8 +15,6 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new RegressionL1loss(config);
} else if (type == std::string("quantile")) {
return new RegressionQuantileloss(config);
} else if (type == std::string("quantile_l2")) {
return new RegressionQuantileL2loss(config);
} else if (type == std::string("huber")) {
return new RegressionHuberLoss(config);
} else if (type == std::string("fair")) {
......@@ -35,6 +33,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new CrossEntropy(config);
} else if (type == std::string("xentlambda") || type == std::string("cross_entropy_lambda")) {
return new CrossEntropyLambda(config);
} else if (type == std::string("mean_absolute_percentage_error") || type == std::string("mape")) {
return new RegressionMAPELOSS(config);
}
return nullptr;
}
......@@ -48,8 +48,6 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new RegressionL1loss(strs);
} else if (type == std::string("quantile")) {
return new RegressionQuantileloss(strs);
} else if (type == std::string("quantile_l2")) {
return new RegressionQuantileL2loss(strs);
} else if (type == std::string("huber")) {
return new RegressionHuberLoss(strs);
} else if (type == std::string("fair")) {
......
......@@ -4,10 +4,60 @@
#include <LightGBM/meta.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/array_args.h>
namespace LightGBM {
#define PercentileFun(T, data_reader, cnt_data, alpha) {\
std::vector<T> ref_data(cnt_data);\
for (data_size_t i = 0; i < cnt_data; ++i) {\
ref_data[i] = data_reader(i);\
}\
const double float_pos = (1.0f - alpha) * cnt_data;\
const data_size_t pos = static_cast<data_size_t>(float_pos);\
if (pos < 1) {\
return ref_data[ArrayArgs<T>::ArgMax(ref_data)];\
} else if (pos >= cnt_data) {\
return ref_data[ArrayArgs<T>::ArgMin(ref_data)];\
} else {\
const double bias = float_pos - pos;\
if (pos > cnt_data / 2) {\
ArrayArgs<T>::ArgMaxAtK(&ref_data, 0, cnt_data, pos - 1);\
T v1 = ref_data[pos - 1];\
T v2 = ref_data[pos + ArrayArgs<T>::ArgMax(ref_data.data() + pos, cnt_data - pos)];\
return static_cast<T>(v1 - (v1 - v2) * bias);\
} else {\
ArrayArgs<T>::ArgMaxAtK(&ref_data, 0, cnt_data, pos);\
T v2 = ref_data[pos];\
T v1 = ref_data[ArrayArgs<T>::ArgMin(ref_data.data(), pos)];\
return static_cast<T>(v1 - (v1 - v2) * bias);\
}\
}\
}\
#define WeightedPercentileFun(T, data_reader, weight_reader, cnt_data, alpha) {\
std::vector<data_size_t> sorted_idx(cnt_data);\
for (data_size_t i = 0; i < cnt_data; ++i) {\
sorted_idx[i] = i;\
}\
std::sort(sorted_idx.begin(), sorted_idx.end(), [=](data_size_t a, data_size_t b) {return data_reader(a) < data_reader(b); });\
std::vector<double> weighted_cdf(cnt_data);\
weighted_cdf[0] = weight_reader(sorted_idx[0]);\
for (data_size_t i = 1; i < cnt_data; ++i) {\
weighted_cdf[i] = weighted_cdf[i - 1] + weight_reader(sorted_idx[i]);\
}\
double threshold = weighted_cdf[cnt_data - 1] * alpha;\
size_t pos = std::upper_bound(weighted_cdf.begin(), weighted_cdf.end(), threshold) - weighted_cdf.begin();\
if (pos == 0) {\
return data_reader(sorted_idx[0]);\
}\
CHECK(threshold >= weighted_cdf[pos - 1]);\
CHECK(threshold < weighted_cdf[pos]);\
T v1 = data_reader(sorted_idx[pos - 1]);\
T v2 = data_reader(sorted_idx[pos]);\
return static_cast<T>((threshold - weighted_cdf[pos]) / (weighted_cdf[pos + 1] - weighted_cdf[pos]) * (v2 - v1) + v1);\
}\
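A plain Python sketch of what WeightedPercentileFun is intended to compute, i.e. the value at which the weighted CDF first crosses alpha (alpha = 0.5 gives the weighted median used as the L1/MAPE init score); interpolation details are simplified here:

import numpy as np

def weighted_percentile(values, weights, alpha):
    order = np.argsort(values)
    values = np.asarray(values, dtype=float)[order]
    weights = np.asarray(weights, dtype=float)[order]
    cdf = np.cumsum(weights)                              # weighted CDF over sorted values
    threshold = alpha * cdf[-1]
    pos = np.searchsorted(cdf, threshold, side="right")
    return values[min(pos, len(values) - 1)]

print(weighted_percentile([3.0, 1.0, 2.0, 10.0], [1.0, 1.0, 1.0, 5.0], 0.5))   # -> 10.0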
/*!
* \brief Objective function for regression
*/
......@@ -25,7 +75,7 @@ public:
}
}
}
~RegressionL2loss() {
}
......@@ -34,8 +84,9 @@ public:
label_ = metadata.label();
if (sqrt_) {
trans_label_.resize(num_data_);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
trans_label_[i] = std::copysign(std::sqrt(std::fabs(label_[i])), label_[i]);
trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i]));
}
label_ = trans_label_.data();
}
......@@ -65,7 +116,7 @@ public:
void ConvertOutput(const double* input, double* output) const override {
if (sqrt_) {
output[0] = std::copysign(input[0] * input[0], input[0]);
output[0] = Common::Sign(input[0]) * input[0] * input[0];
} else {
output[0] = input[0];
}
......@@ -88,12 +139,23 @@ public:
}
}
bool BoostFromAverage() const override {
if (sqrt_) {
return false;
double BoostFromScore() const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
}
} else {
return true;
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
}
}
return suml / sumw;
}
protected:
......@@ -113,11 +175,9 @@ protected:
class RegressionL1loss: public RegressionL2loss {
public:
explicit RegressionL1loss(const ObjectiveConfig& config): RegressionL2loss(config) {
eta_ = static_cast<double>(config.gaussian_eta);
}
explicit RegressionL1loss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
}
~RegressionL1loss() {}
......@@ -128,37 +188,71 @@ public:
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (diff >= 0.0f) {
gradients[i] = 1.0f;
} else {
gradients[i] = -1.0f;
}
hessians[i] = static_cast<score_t>(Common::ApproximateHessianWithGaussian(score[i], label_[i], gradients[i], eta_));
gradients[i] = static_cast<score_t>(Common::Sign(diff));
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (diff >= 0.0f) {
gradients[i] = static_cast<score_t>(weights_[i]);
} else {
gradients[i] = static_cast<score_t>(-weights_[i]);
}
hessians[i] = static_cast<score_t>(Common::ApproximateHessianWithGaussian(score[i], label_[i], gradients[i], eta_, weights_[i]));
gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i]);
hessians[i] = weights_[i];
}
}
}
const char* GetName() const override {
return "regression_l1";
double BoostFromScore() const override {
const double alpha = 0.5;
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
#define weight_reader(i) (weights_[i])
WeightedPercentileFun(label_t, data_reader, weight_reader, num_data_, alpha);
#undef data_reader
#undef weight_reader
} else {
#define data_reader(i) (label_[i])
PercentileFun(label_t, data_reader, num_data_, alpha);
#undef data_reader
}
}
bool IsConstantHessian() const override {
return false;
bool IsRenewTreeOutput() const override { return true; }
double RenewTreeOutput(double, const double* pred,
const data_size_t* index_mapper,
const data_size_t* bagging_mapper,
data_size_t num_data_in_leaf) const override {
const double alpha = 0.5;
if (weights_ == nullptr) {
if (bagging_mapper == nullptr) {
#define data_reader(i) (label_[index_mapper[i]] - pred[index_mapper[i]])
PercentileFun(double, data_reader, num_data_in_leaf, alpha);
#undef data_reader
} else {
#define data_reader(i) (label_[bagging_mapper[index_mapper[i]]] - pred[bagging_mapper[index_mapper[i]]])
PercentileFun(double, data_reader, num_data_in_leaf, alpha);
#undef data_reader
}
} else {
if (bagging_mapper == nullptr) {
#define data_reader(i) (label_[index_mapper[i]] - pred[index_mapper[i]])
#define weight_reader(i) (weights_[index_mapper[i]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha);
#undef data_reader
#undef weight_reader
} else {
#define data_reader(i) (label_[bagging_mapper[index_mapper[i]]] - pred[bagging_mapper[index_mapper[i]]])
#define weight_reader(i) (weights_[bagging_mapper[index_mapper[i]]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha);
#undef data_reader
#undef weight_reader
}
}
}
private:
double eta_;
const char* GetName() const override {
return "regression_l1";
}
};
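The leaf renewal above matters because, with sign gradients and unit hessians, the gradient-based leaf value -sum(grad)/sum(hess) is not the L1-optimal constant; the median of the residuals in the leaf is. A tiny numeric sketch:

import numpy as np

residuals = np.array([-0.2, 0.1, 0.3, 9.0])                     # label - prediction for samples in one leaf
newton_value = -np.sum(np.sign(-residuals)) / len(residuals)    # from sign gradients and unit hessians: 0.5
median_value = np.median(residuals)                             # what RenewTreeOutput installs instead: 0.2
print(newton_value, median_value)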
/*!
......@@ -168,7 +262,6 @@ class RegressionHuberLoss: public RegressionL2loss {
public:
explicit RegressionHuberLoss(const ObjectiveConfig& config): RegressionL2loss(config) {
alpha_ = static_cast<double>(config.alpha);
eta_ = static_cast<double>(config.gaussian_eta);
}
explicit RegressionHuberLoss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
......@@ -184,35 +277,23 @@ public:
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
gradients[i] = static_cast<score_t>(diff);
hessians[i] = 1.0f;
} else {
if (diff >= 0.0f) {
gradients[i] = static_cast<score_t>(alpha_);
} else {
gradients[i] = static_cast<score_t>(-alpha_);
}
hessians[i] = static_cast<score_t>(Common::ApproximateHessianWithGaussian(score[i], label_[i], gradients[i], eta_));
gradients[i] = static_cast<score_t>(Common::Sign(diff) * alpha_);
}
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double diff = score[i] - label_[i];
if (std::abs(diff) <= alpha_) {
gradients[i] = static_cast<score_t>(diff * weights_[i]);
hessians[i] = static_cast<score_t>(weights_[i]);
} else {
if (diff >= 0.0f) {
gradients[i] = static_cast<score_t>(alpha_ * weights_[i]);
} else {
gradients[i] = static_cast<score_t>(-alpha_ * weights_[i]);
}
hessians[i] = static_cast<score_t>(Common::ApproximateHessianWithGaussian(score[i], label_[i], gradients[i], eta_, weights_[i]));
gradients[i] = static_cast<score_t>(Common::Sign(diff) * weights_[i] * alpha_);
}
hessians[i] = static_cast<score_t>(weights_[i]);
}
}
}
......@@ -228,8 +309,6 @@ public:
private:
/*! \brief delta for Huber loss */
double alpha_;
/*! \brief a parameter to control the width of Gaussian function to approximate hessian */
double eta_;
};
......@@ -286,6 +365,10 @@ class RegressionPoissonLoss: public RegressionL2loss {
public:
explicit RegressionPoissonLoss(const ObjectiveConfig& config): RegressionL2loss(config) {
max_delta_step_ = static_cast<double>(config.poisson_max_delta_step);
if (sqrt_) {
Log::Warning("cannot use sqrt transform in Poisson Regression, will auto disable it.");
sqrt_ = false;
}
}
explicit RegressionPoissonLoss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
......@@ -295,6 +378,10 @@ public:
~RegressionPoissonLoss() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
if (sqrt_) {
Log::Warning("cannot use sqrt transform in Poisson Regression, will auto disable it.");
sqrt_ = false;
}
RegressionL2loss::Init(metadata, num_data);
// Safety check of labels
label_t miny;
......@@ -322,22 +409,19 @@ public:
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double ef = std::exp(score[i]);
gradients[i] = static_cast<score_t>(ef - label_[i]);
hessians[i] = static_cast<score_t>(ef);
gradients[i] = static_cast<score_t>(std::exp(score[i]) - label_[i]);
hessians[i] = static_cast<score_t>(std::exp(score[i] + max_delta_step_));
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
const double ef = std::exp(score[i]);
gradients[i] = static_cast<score_t>((ef - label_[i]) * weights_[i]);
hessians[i] = static_cast<score_t>(ef * weights_[i]);
gradients[i] = static_cast<score_t>((std::exp(score[i]) - label_[i]) * weights_[i]);
hessians[i] = static_cast<score_t>(std::exp(score[i] + max_delta_step_) * weights_[i]);
}
}
}
void ConvertOutput(const double* input, double* output) const override {
RegressionL2loss::ConvertOutput(input, output);
output[0] = std::exp(input[0]);
}
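The gradient above follows from the Poisson deviance exp(score) - label * score, whose true hessian is exp(score); writing the hessian as exp(score + max_delta_step) inflates it by the constant factor exp(max_delta_step), which damps the per-leaf Newton step and safeguards against huge updates when exp(score) is far below the label. A short sketch of the effect:

import numpy as np

def poisson_grad_hess(score, label, max_delta_step=0.7):
    grad = np.exp(score) - label                   # d/ds [exp(s) - y*s]
    hess = np.exp(score + max_delta_step)          # true hessian exp(s), scaled by exp(max_delta_step)
    return grad, hess

g, h = poisson_grad_hess(score=np.array([-3.0]), label=np.array([5.0]))
print(-g / np.exp(-3.0), -g / h)   # raw Newton step ~ 99.4 vs. damped step ~ 49.4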
......@@ -345,25 +429,8 @@ public:
return "poisson";
}
bool GetCustomAverage(double *initscore) const override {
if (initscore == nullptr) return false;
double sumw = 0.0f;
double sumy = 0.0f;
if (weights_ == nullptr) {
for (data_size_t i = 0; i < num_data_; i++) {
sumy += label_[i];
}
sumw = static_cast<double>(num_data_);
} else {
for (data_size_t i = 0; i < num_data_; i++) {
sumy += weights_[i] * label_[i];
sumw += weights_[i];
}
}
const double yavg = sumy / sumw;
*initscore = std::log(yavg);
Log::Info("[%s:%s]: yavg=%f -> initscore=%f", GetName(), __func__, yavg, *initscore);
return true;
double BoostFromScore() const override {
return std::log(RegressionL2loss::BoostFromScore());
}
bool IsConstantHessian() const override {
......@@ -418,63 +485,159 @@ public:
return "quantile";
}
double BoostFromScore() const override {
if (weights_ != nullptr) {
#define data_reader(i) (label_[i])
#define weight_reader(i) (weights_[i])
WeightedPercentileFun(label_t, data_reader, weight_reader, num_data_, alpha_);
#undef data_reader
#undef weight_reader
} else {
#define data_reader(i) (label_[i])
PercentileFun(label_t, data_reader, num_data_, alpha_);
#undef data_reader
}
}
bool IsRenewTreeOutput() const override { return true; }
double RenewTreeOutput(double, const double* pred,
const data_size_t* index_mapper,
const data_size_t* bagging_mapper,
data_size_t num_data_in_leaf) const override {
if (weights_ == nullptr) {
if (bagging_mapper == nullptr) {
#define data_reader(i) (label_[index_mapper[i]] - pred[index_mapper[i]])
PercentileFun(double, data_reader, num_data_in_leaf, alpha_);
#undef data_reader
} else {
#define data_reader(i) (label_[bagging_mapper[index_mapper[i]]] - pred[bagging_mapper[index_mapper[i]]])
PercentileFun(double, data_reader, num_data_in_leaf, alpha_);
#undef data_reader
}
} else {
if (bagging_mapper == nullptr) {
#define data_reader(i) (label_[index_mapper[i]] - pred[index_mapper[i]])
#define weight_reader(i) (weights_[index_mapper[i]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha_);
#undef data_reader
#undef weight_reader
} else {
#define data_reader(i) (label_[bagging_mapper[index_mapper[i]]] - pred[bagging_mapper[index_mapper[i]]])
#define weight_reader(i) (weights_[bagging_mapper[index_mapper[i]]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha_);
#undef data_reader
#undef weight_reader
}
}
}
private:
score_t alpha_;
};
class RegressionQuantileL2loss : public RegressionL2loss {
/*!
* \brief Mape Regression Loss
*/
class RegressionMAPELOSS : public RegressionL1loss {
public:
explicit RegressionQuantileL2loss(const ObjectiveConfig& config) : RegressionL2loss(config) {
alpha_ = static_cast<score_t>(config.alpha);
explicit RegressionMAPELOSS(const ObjectiveConfig& config) : RegressionL1loss(config) {
}
explicit RegressionQuantileL2loss(const std::vector<std::string>& strs) : RegressionL2loss(strs) {
explicit RegressionMAPELOSS(const std::vector<std::string>& strs) : RegressionL1loss(strs) {
}
~RegressionQuantileL2loss() {}
~RegressionMAPELOSS() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
RegressionL2loss::Init(metadata, num_data);
for (data_size_t i = 0; i < num_data_; ++i) {
if (std::fabs(label_[i]) < 1) {
Log::Warning("Met 'abs(label) < 1', will convert them to '1' in Mape objective and metric.");
break;
}
}
label_weight_.resize(num_data);
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i]));
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i];
}
}
}
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta > 0) {
gradients[i] = (1.0f - alpha_) * delta;
hessians[i] = (1.0f - alpha_);
} else {
gradients[i] = alpha_ * delta;
hessians[i] = alpha_;
}
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
hessians[i] = 1.0f;
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
score_t delta = static_cast<score_t>(score[i] - label_[i]);
if (delta > 0) {
gradients[i] = static_cast<score_t>((1.0f - alpha_) * delta * weights_[i]);
hessians[i] = static_cast<score_t>((1.0f - alpha_) * weights_[i]);
} else {
gradients[i] = static_cast<score_t>(alpha_ * delta * weights_[i]);
hessians[i] = static_cast<score_t>(alpha_ * weights_[i]);
}
const double diff = score[i] - label_[i];
gradients[i] = static_cast<score_t>(Common::Sign(diff) * label_weight_[i]);
hessians[i] = weights_[i];
}
}
}
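In effect the MAPE objective is handled as an L1 loss with per-sample weights 1 / max(1, |label|): that weight is precomputed into label_weight_ in Init, the gradient is sign(score - label) times that weight, and the hessian stays constant. A minimal sketch of the same computation:

import numpy as np

def mape_grad_hess(score, label, sample_weight=None):
    label_weight = 1.0 / np.maximum(1.0, np.abs(label))     # the label_weight_ array above
    if sample_weight is not None:
        label_weight = label_weight * sample_weight
    grad = np.sign(score - label) * label_weight
    hess = np.ones_like(score) if sample_weight is None else np.asarray(sample_weight, dtype=float)
    return grad, hess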
bool IsConstantHessian() const override {
return false;
double BoostFromScore() const override {
const double alpha = 0.5;
#define data_reader(i) (label_[i])
#define weight_reader(i) (label_weight_[i])
WeightedPercentileFun(label_t, data_reader, weight_reader, num_data_, alpha);
#undef data_reader
#undef weight_reader
}
bool IsRenewTreeOutput() const override { return true; }
double RenewTreeOutput(double, const double* pred,
const data_size_t* index_mapper,
const data_size_t* bagging_mapper,
data_size_t num_data_in_leaf) const override {
const double alpha = 0.5;
if (bagging_mapper == nullptr) {
#define data_reader(i) (label_[index_mapper[i]] - pred[index_mapper[i]])
#define weight_reader(i) (label_weight_[index_mapper[i]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha);
#undef data_reader
#undef weight_reader
} else {
#define data_reader(i) (label_[bagging_mapper[index_mapper[i]]] - pred[bagging_mapper[index_mapper[i]]])
#define weight_reader(i) (label_weight_[bagging_mapper[index_mapper[i]]])
WeightedPercentileFun(double, data_reader, weight_reader, num_data_in_leaf, alpha);
#undef data_reader
#undef weight_reader
}
}
const char* GetName() const override {
return "quantile_l2";
return "mape";
}
bool IsConstantHessian() const override {
return true;
}
private:
score_t alpha_;
std::vector<label_t> label_weight_;
};
#undef PercentileFun
#undef WeightedPercentileFun
} // namespace LightGBM
#endif // LightGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
......@@ -104,29 +104,27 @@ public:
return str_buf.str();
}
// allow boost from average option
bool BoostFromAverage() const override { return true; }
// implement custom average to boost from (if enabled among options)
bool GetCustomAverage(double *initscore) const override {
if (initscore == nullptr) return false;
double BoostFromScore() const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
}
}
double pavg = suml / sumw;
*initscore = std::log(pavg / (1.0f - pavg));
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, *initscore);
return true;
double initscore = std::log(pavg / (1.0f - pavg));
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
return initscore;
}
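A quick numeric check of the formula: the init score is the logit of the (weighted) average label, so applying the sigmoid to it recovers pavg.

import math

pavg = 0.25
init_score = math.log(pavg / (1.0 - pavg))
print(init_score, 1.0 / (1.0 + math.exp(-init_score)))   # ~ -1.0986, 0.25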
private:
......@@ -232,22 +230,26 @@ public:
return str_buf.str();
}
bool BoostFromAverage() const override { return true; }
bool GetCustomAverage(double *initscore) const override {
if (initscore == nullptr) return false;
double sumy = 0.0f;
for (data_size_t i = 0; i < num_data_; ++i) sumy += label_[i];
double BoostFromScore() const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
for (data_size_t i = 0; i < num_data_; ++i) sumw += weights_[i];
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
}
} else {
sumw = static_cast<double>(num_data_);
#pragma omp parallel for schedule(static) reduction(+:suml)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i];
}
}
double havg = sumy / sumw;
*initscore = std::log(std::exp(havg) - 1.0f);
Log::Info("[%s:%s]: havg=%f -> initscore=%f", GetName(), __func__, havg, *initscore);
return true;
double havg = suml / sumw;
double initscore = std::log(std::exp(havg) - 1.0f);
Log::Info("[%s:%s]: havg=%f -> initscore=%f", GetName(), __func__, havg, initscore);
return initscore;
}
private:
......
......@@ -303,10 +303,8 @@ public:
* \return leaf output
*/
static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2) {
double abs_sum_gradients = std::fabs(sum_gradients);
double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - l1);
return -std::copysign(reg_abs_sum_gradients, sum_gradients)
/ (sum_hessians + l2);
const double reg_abs_sum_gradients = std::max(0.0, std::fabs(sum_gradients) - l1);
return -(Common::Sign(sum_gradients) * reg_abs_sum_gradients) / (sum_hessians + l2);
}
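The rewritten formula is the usual L1/L2-regularized Newton leaf value: soft-threshold the gradient sum by l1, keep its sign, and divide by the hessian sum plus l2. A worked example:

import math

def leaf_output(sum_gradients, sum_hessians, l1, l2):
    reg = max(0.0, abs(sum_gradients) - l1)                         # soft-thresholding by l1
    return -math.copysign(reg, sum_gradients) / (sum_hessians + l2)

print(leaf_output(-4.0, 10.0, l1=1.0, l2=0.0))   # -> 0.3
print(leaf_output(0.5, 10.0, l1=1.0, l2=0.0))    # -> -0.0, the leaf is pruned to zero by l1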
private:
......