Commit 00cb04a2 authored by Guolin Ke, committed by GitHub

Better missing value handle (#747)

* finish the data loading part

* allow prediction.

* fix bug for decision type.

* finish split finding part

* fix bugs.

* fix bug; add a test.

* fix pep8.

* update documents.

* fix test bugs.

* fix a format

* fix import error in python test.

* disable missing handle in categorical features.

* fix a bug.

* add more tests.

* fix pep8

* fix bugs.

* remove the missing handle code for categorical feature.
parent db4374e1
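For orientation, here is a minimal, hedged sketch of how the new handling is exercised from the Python API. The parameters `use_missing` and `zero_as_missing` are the ones the tests below toggle; the toy data mirrors those tests, and the remaining settings are illustrative only.

import numpy as np
import lightgbm as lgb

# Toy data: the last row's feature is NaN and should follow the learned
# default direction at both training and prediction time.
X = np.array([0, 1, 2, 3, 4, 5, 6, 7, np.nan]).reshape(-1, 1)
y = np.array([1, 1, 1, 1, 0, 0, 0, 0, 1])

params = {
    'objective': 'binary',
    'use_missing': True,        # enable the missing value handling
    'zero_as_missing': False,   # treat NaN (not zero) as missing
    'min_data': 1,
    'min_data_in_bin': 1,
    'num_leaves': 2,
    'verbose': -1,
}
gbm = lgb.train(params, lgb.Dataset(X, y), num_boost_round=1)
print(gbm.predict(X))  # expect the NaN row's score to match the x=0 row's (cf. tests below)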
@@ -14,6 +14,7 @@ namespace LightGBM
 class FeatureMetainfo {
  public:
   int num_bin;
+  MissingType missing_type;
   int bias = 0;
   uint32_t default_bin;
   /*! \brief pointer of tree config */
@@ -70,7 +71,7 @@ public:
   void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
                          SplitInfo* output) {
-    output->default_bin_for_zero = meta_->default_bin;
+    output->default_left = true;
     output->gain = kMinScore;
     find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
   }
@@ -82,17 +83,27 @@ public:
     double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
                                          meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
     double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
-    if (meta_->tree_config->use_missing && meta_->num_bin > 2) {
-      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
-      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
+    if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
+      if (meta_->missing_type == MissingType::Zero) {
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, true, false);
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, true, false);
+      } else {
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, false, true);
+        FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, false, true);
+      }
     } else {
-      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, false, false);
+      // fix the direction error when there are only two bins
+      if (meta_->missing_type == MissingType::NaN) {
+        output->default_left = false;
+      }
     }
     output->gain -= min_gain_shift;
   }
   void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
                                     SplitInfo* output) {
+    output->default_left = false;
     double best_gain = kMinScore;
     uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
     data_size_t best_left_count = 0;
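Read as pseudocode, the new dispatch tries both default directions whenever missing values can occur, then keeps the better split. A rough Python paraphrase follows; `sweep` and the string-valued missing types are illustrative stand-ins for the C++ `FindBestThresholdSequence` and the `MissingType` enum, not a real API.

def find_best_threshold_numerical(meta, sweep):
    # sweep(direction, skip_default_bin, use_na_as_missing) scans the histogram
    # in one direction and updates the best split found so far.
    if meta.num_bin > 2 and meta.missing_type != 'None':
        if meta.missing_type == 'Zero':
            # zero acts as the missing value: skip its bin, try both directions
            sweep(-1, True, False)
            sweep(+1, True, False)
        else:  # 'NaN'
            # NaN rows live outside the ordinary bins: try both directions
            sweep(-1, False, True)
            sweep(+1, False, True)
    else:
        # two bins or no missing values: a single right-to-left sweep suffices
        sweep(-1, False, False)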
@@ -245,14 +256,8 @@ public:
 private:
   void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_gain_shift,
-                                 SplitInfo* output, uint32_t default_bin_for_zero) {
-    int dir = -1;
-    if (static_cast<int>(default_bin_for_zero) == meta_->num_bin - 1) { dir = 1; };
-    bool skip_default_bin = true;
-    if (static_cast<int>(default_bin_for_zero) > 0 && static_cast<int>(default_bin_for_zero) < meta_->num_bin - 1) {
-      skip_default_bin = false;
-    }
+                                 SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
     const int bias = meta_->bias;
     double best_sum_left_gradient = NAN;
@@ -267,7 +272,7 @@ private:
     double sum_right_hessian = kEpsilon;
     data_size_t right_count = 0;
-    int t = meta_->num_bin - 1 - bias;
+    int t = meta_->num_bin - 1 - bias - use_na_as_missing;
     const int t_end = 1 - bias;
     // from right to left, and we don't need data in bin0
@@ -319,15 +324,27 @@ private:
     int t = 0;
     const int t_end = meta_->num_bin - 2 - bias;
     // from left to right, and we don't need data in bin0
+    if (use_na_as_missing && bias == 1) {
+      sum_left_gradient = sum_gradient;
+      sum_left_hessian = sum_hessian - kEpsilon;
+      left_count = num_data;
+      for (int i = 0; i < meta_->num_bin - bias; ++i) {
+        sum_left_gradient -= data_[i].sum_gradients;
+        sum_left_hessian -= data_[i].sum_hessians;
+        left_count -= data_[i].cnt;
+      }
+      t = -1;
+    }
     for (; t <= t_end; ++t) {
       // need to skip default bin
       if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
-      sum_left_gradient += data_[t].sum_gradients;
-      sum_left_hessian += data_[t].sum_hessians;
-      left_count += data_[t].cnt;
+      if (t >= 0) {
+        sum_left_gradient += data_[t].sum_gradients;
+        sum_left_hessian += data_[t].sum_hessians;
+        left_count += data_[t].cnt;
+      }
       // if data not enough, or sum hessian too small
       if (left_count < meta_->tree_config->min_data_in_leaf
           || sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
@@ -376,7 +393,7 @@ private:
       output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
       output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
       output->gain = best_gain;
-      output->default_bin_for_zero = default_bin_for_zero;
+      output->default_left = dir == -1;
     }
   }
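The left-to-right sweep handles `use_na_as_missing` by seeding the left side with everything that never landed in an ordinary bin, which is exactly the NaN rows; the loop then moves bins across one at a time. A small numeric sketch of that initialization, with invented values:

# (gradient, hessian, count) totals per ordinary bin; 5 of the 100 rows are NaN
bins = [(2.0, 1.5, 20), (3.0, 2.0, 30), (4.0, 3.5, 45)]
sum_gradient, sum_hessian, num_data = 10.0, 8.0, 100

# start the left side as "total minus all binned data", i.e. the NaN bucket
left_grad = sum_gradient - sum(g for g, _, _ in bins)  # 1.0
left_hess = sum_hessian - sum(h for _, h, _ in bins)   # 1.0
left_cnt = num_data - sum(c for _, _, c in bins)       # 5 NaN rows

print(left_grad, left_hess, left_cnt)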
@@ -444,6 +461,7 @@ public:
     for (int i = 0; i < num_feature; ++i) {
       feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
       feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
+      feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
       if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
         feature_metas_[i].bias = 1;
       } else {
@@ -521,10 +521,6 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf)
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
   // left = parent
   *left_leaf = best_Leaf;
-  double default_value = 0.0f;
-  if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() != best_split_info.default_bin_for_zero) {
-    default_value = train_data_->RealThreshold(inner_feature_index, best_split_info.default_bin_for_zero);
-  }
   // split tree, will return right leaf
   *right_leaf = tree->Split(best_Leaf,
                             inner_feature_index,
@@ -537,12 +533,11 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf)
                             static_cast<data_size_t>(best_split_info.left_count),
                             static_cast<data_size_t>(best_split_info.right_count),
                             static_cast<double>(best_split_info.gain),
-                            train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin(),
-                            best_split_info.default_bin_for_zero,
-                            default_value);
+                            train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
+                            best_split_info.default_left);
   // split data partition
   data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
-                         best_split_info.threshold, best_split_info.default_bin_for_zero, *right_leaf);
+                         best_split_info.threshold, best_split_info.default_left, *right_leaf);
   // init the leaves that are used in the next iteration
   if (best_split_info.left_count < best_split_info.right_count) {
@@ -20,8 +20,8 @@ public:
   int feature;
   /*! \brief Split threshold */
   uint32_t threshold;
-  uint32_t default_bin_for_zero;
+  /*! \brief True if default split is left */
+  bool default_left;
   /*! \brief Left output after split */
   double left_output;
   /*! \brief Right output after split */
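With `default_bin_for_zero` replaced by a boolean, routing a value at prediction time reduces to a three-way check. A schematic Python version follows; it mirrors the idea of `default_left`, not the actual C++ `Tree` code, and `route` is a hypothetical helper.

import math

def route(value, threshold, default_left, zero_as_missing=False):
    # missing values follow the direction that won during split finding
    is_missing = (value == 0) if zero_as_missing else math.isnan(value)
    if is_missing:
        return 'left' if default_left else 'right'
    return 'left' if value <= threshold else 'right'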
@@ -68,6 +68,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
   for (int i = 0; i < train_data->num_features(); ++i) {
     feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
     feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
+    feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
     if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
       feature_metas_[i].bias = 1;
     } else {
@@ -6,6 +6,7 @@ import os
 import unittest
 import lightgbm as lgb
 import random
+import numpy as np
 from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
                               load_iris, load_svmlight_file)
@@ -94,6 +95,120 @@ class TestEngine(unittest.TestCase):
         self.assertLess(ret, 16)
         self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
+
+    def test_missing_value_handle(self):
+        X_train = np.zeros((1000, 1))
+        y_train = np.zeros(1000)
+        trues = random.sample(range(1000), 200)
+        for idx in trues:
+            X_train[idx, 0] = np.nan
+            y_train[idx] = 1
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_train, y_train)
+        params = {
+            'metric': 'l2',
+            'verbose': -1,
+            'boost_from_average': False
+        }
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=20,
+                        valid_sets=lgb_eval,
+                        verbose_eval=True,
+                        evals_result=evals_result)
+        ret = mean_squared_error(y_train, gbm.predict(X_train))
+        self.assertLess(ret, 0.005)
+        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
+
+    def test_missing_value_handle_na(self):
+        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
+        y = [1, 1, 1, 1, 0, 0, 0, 0, 1]
+        X_train = np.array(x).reshape(len(x), 1)
+        y_train = np.array(y)
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_train, y_train)
+        params = {
+            'objective': 'binary',
+            'metric': 'auc',
+            'verbose': -1,
+            'boost_from_average': False,
+            'min_data': 1,
+            'num_leaves': 2,
+            'learning_rate': 1,
+            'min_data_in_bin': 1,
+            'zero_as_missing': False
+        }
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=1,
+                        valid_sets=lgb_eval,
+                        verbose_eval=True,
+                        evals_result=evals_result)
+        pred = gbm.predict(X_train)
+        self.assertAlmostEqual(pred[-1], pred[0], places=5)
+
+    def test_missing_value_handle_zero(self):
+        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
+        y = [0, 1, 1, 1, 0, 0, 0, 0, 0]
+        X_train = np.array(x).reshape(len(x), 1)
+        y_train = np.array(y)
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_train, y_train)
+        params = {
+            'objective': 'binary',
+            'metric': 'auc',
+            'verbose': -1,
+            'boost_from_average': False,
+            'min_data': 1,
+            'num_leaves': 2,
+            'learning_rate': 1,
+            'min_data_in_bin': 1,
+            'zero_as_missing': True
+        }
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=1,
+                        valid_sets=lgb_eval,
+                        verbose_eval=True,
+                        evals_result=evals_result)
+        pred = gbm.predict(X_train)
+        self.assertAlmostEqual(pred[-1], pred[-2], places=5)
+        self.assertAlmostEqual(pred[-1], pred[0], places=5)
+
+    def test_missing_value_handle_none(self):
+        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
+        y = [0, 1, 1, 1, 0, 0, 0, 0, 0]
+        X_train = np.array(x).reshape(len(x), 1)
+        y_train = np.array(y)
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_train, y_train)
+        params = {
+            'objective': 'binary',
+            'metric': 'auc',
+            'verbose': -1,
+            'boost_from_average': False,
+            'min_data': 1,
+            'num_leaves': 2,
+            'learning_rate': 1,
+            'min_data_in_bin': 1,
+            'use_missing': False
+        }
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=1,
+                        valid_sets=lgb_eval,
+                        verbose_eval=True,
+                        evals_result=evals_result)
+        pred = gbm.predict(X_train)
+        self.assertAlmostEqual(pred[0], pred[1], places=5)
+        self.assertAlmostEqual(pred[-1], pred[0], places=5)
+
     def test_multiclass(self):
         X, y = load_digits(10, True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)