Support disabling the special handling of missing values.

3abff370 · Guolin Ke · 348c2b51 · 3abff370 · 3abff370 · 3abff370
Commit 3abff370 authored May 26, 2017 by Guolin Ke
4 changed files
--- a/docs/Parameters.md
+++ b/docs/Parameters.md
@@ -128,6 +128,8 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
 * `max_bin`, default=`255`, type=int
  * max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit).
  * LightGBM will auto compress memory according `max_bin`. For example, LightGBM will use `uint8_t` for feature value if `max_bin=255`.
+* `min_data_in_bin`, default=`5`, type=int
+  * min number of data inside one bin; use this to avoid one-data-one-bin, which may cause over-fitting.
 * `data_random_seed`, default=`1`, type=int
  * random seed for data partition in parallel learning(not include feature parallel).
 * `output_model`, default=`LightGBM_model.txt`, type=string, alias=`model_output`,`model_out`
@@ -190,6 +192,8 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
 * `num_iteration_predict`, default=`-1`, type=int
  * only used in prediction task, used to how many trained iterations will be used in prediction. 
  * `<= 0` means no limit
+* `use_missing`, default=`true`, type=bool
+  * Set to `false` to disable the special handling of missing values.


 ## Objective parameters

--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -208,6 +208,8 @@ public:
  int gpu_device_id = -1;
  /*! \brief Set to true to use double precision math on GPU (default using single precision) */
  bool gpu_use_dp = false;
+  /*! \brief Set to false to disable the special handling of missing values */
+  bool use_missing = true;
  LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
 };


--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -328,6 +328,7 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
  GetInt(params, "gpu_platform_id", &gpu_platform_id);
  GetInt(params, "gpu_device_id", &gpu_device_id);
  GetBool(params, "gpu_use_dp", &gpu_use_dp);
+  GetBool(params, "use_missing", &use_missing);
 }



--- a/src/treelearner/feature_histogram.hpp
+++ b/src/treelearner/feature_histogram.hpp
@@ -82,15 +82,18 @@ public:
    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
    double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
-
-    FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
-    // Zero is not in leftmost or rightmost
-    if (static_cast<int>(meta_->default_bin) > 0 && static_cast<int>(meta_->default_bin) < meta_->num_bin - 1) {
+    if (meta_->tree_config->use_missing) {
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
+      // Zero is not in leftmost or rightmost
+      if (static_cast<int>(meta_->default_bin) > 0 && static_cast<int>(meta_->default_bin) < meta_->num_bin - 1) {
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
+      }
+      if (meta_->num_bin > 2) {
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
+      }
+    } else {
      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
    }
-    if (meta_->num_bin > 2) {
-      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
-    }
    output->gain -= min_gain_shift;
  }