Force checking the label used in lambdarank.

cf93bfa1 · Guolin Ke · 9bb3b0d0 · cf93bfa1 · cf93bfa1 · cf93bfa1
Commit cf93bfa1 authored Jul 21, 2017 by Guolin Ke
6 changed files
--- a/docs/Parameters.md
+++ b/docs/Parameters.md
@@ -38,6 +38,8 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
    * `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression")
  * `binary`, binary classification application 
  * `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
+    * The label should be of `int` type in lambdarank tasks, and larger numbers represent higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect).
+    * `label_gain` can be used to set the gain (weight) of each `int` label.
  * `multiclass`, multi-class classification application, should set `num_class` as well
 * `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
  * `gbdt`, traditional Gradient Boosting Decision Tree 

--- a/docs/Quick-Start.md
+++ b/docs/Quick-Start.md
@@ -46,11 +46,18 @@ Some important parameters:
 * ```task```, default=```train```, type=enum, options=```train```,```prediction```
  * ```train``` for training
  * ```prediction``` for prediction.
-* ```application```, default=```regression```, type=enum, options=```regression```,```binary```,```lambdarank```,```multiclass```, alias=```objective```,```app```
-  * ```regression```, regression application
-  * ```binary```, binary classification application 
-  * ```lambdarank```, lambdarank application
-  * ```multiclass```, multi-class classification application, should set ```num_class``` as well
+* `application`, default=`regression`, type=enum, options=`regression`,`regression_l1`,`huber`,`fair`,`poisson`,`binary`,`lambdarank`,`multiclass`, alias=`objective`,`app`
+  * `regression`, regression application
+    * `regression_l2`, L2 loss, alias=`mean_squared_error`,`mse`
+    * `regression_l1`, L1 loss, alias=`mean_absolute_error`,`mae`
+    * `huber`, [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia")
+    * `fair`, [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520)
+    * `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression")
+  * `binary`, binary classification application 
+  * `lambdarank`, [lambdarank](https://pdfs.semanticscholar.org/fc9a/e09f9ced555558fdf1e997c0a5411fb51f15.pdf) application
+    * The label should be of `int` type in lambdarank tasks, and larger numbers represent higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect).
+    * `label_gain` can be used to set the gain (weight) of each `int` label.
+  * `multiclass`, multi-class classification application, should set `num_class` as well
 * `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type`
  * `gbdt`, traditional Gradient Boosting Decision Tree 
  * `rf`, Random Forest

--- a/include/LightGBM/metric.h
+++ b/include/LightGBM/metric.h
@@ -95,6 +95,13 @@ public:
  static double CalMaxDCGAtK(data_size_t k,
    const float* label, data_size_t num_data);

+  /*!
+  * \brief Check the label range for NDCG and lambdarank
+  *
+  * Reports a fatal error through Log::Fatal if any label is not an integer
+  * value or is outside the range [0, label_gain_.size()).
+  * \param label Pointer to the label array
+  * \param num_data Number of entries in label
+  */
+  static void CheckLabel(const float* label, data_size_t num_data);
+
  /*!
  * \brief Calculate the Max DCG score at multi position
  * \param ks The positions want to eval at

--- a/src/metric/dcg_calculator.cpp
+++ b/src/metric/dcg_calculator.cpp
@@ -56,7 +56,6 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
  std::vector<data_size_t> label_cnt(label_gain_.size(), 0);
  // counts for all labels
  for (data_size_t i = 0; i < num_data; ++i) {
-    if (static_cast<size_t>(label[i]) >= label_cnt.size()) { Log::Fatal("Label excel %d", label[i]); }
    ++label_cnt[static_cast<int>(label[i])];
  }
  double cur_result = 0.0f;
@@ -127,4 +126,17 @@ void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* labe
  }
 }

+/*!
+* \brief Validate the labels used by NDCG / lambdarank.
+*
+* Calls Log::Fatal when a label is not an integer value, or when it lies
+* outside [0, label_gain_.size()). Labels are used to index tables of that
+* size (see CalMaxDCG), so an out-of-range label must never get past here.
+* \param label Pointer to the label array
+* \param num_data Number of entries in label
+*/
+void DCGCalculator::CheckLabel(const float* label, data_size_t num_data) {
+  // The bound is converted to float instead of converting each label to
+  // size_t: a float -> size_t cast of a negative or very large value is
+  // undefined behaviour.
+  const float label_limit = static_cast<float>(label_gain_.size());
+  for (data_size_t i = 0; i < num_data; ++i) {
+    // std::trunc stays in floating point, so NaN/Inf/huge labels are safe here.
+    const float delta = std::fabs(label[i] - std::trunc(label[i]));
+    // Written as !(delta <= eps): for NaN/Inf labels delta is NaN, and a plain
+    // (delta > eps) test would let them through.
+    if (!(delta <= kEpsilon)) {
+      Log::Fatal("label should be int type (met %f) for ranking task, "
+                 "for the gain of label, please set the label_gain parameter.", label[i]);
+    }
+    if (label[i] < 0.0f || label[i] >= label_limit) {
+      // %.0f prints the integer-valued label without an int cast that could overflow.
+      Log::Fatal("label (%.0f) exceeds the max range %d",
+                 label[i], static_cast<int>(label_gain_.size()));
+    }
+  }
+}
+
 }  // namespace LightGBM
--- a/src/metric/rank_metric.hpp
+++ b/src/metric/rank_metric.hpp
@@ -40,6 +40,7 @@ public:
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
+    DCGCalculator::CheckLabel(label_, num_data_);
    // get query boundaries
    query_boundaries_ = metadata.query_boundaries();
    if (query_boundaries_ == nullptr) {

--- a/src/objective/rank_objective.hpp
+++ b/src/objective/rank_objective.hpp
@@ -47,6 +47,7 @@ public:
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
+    DCGCalculator::CheckLabel(label_, num_data_);
    // get weights
    weights_ = metadata.weights();
    // get boundries