"src/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "8a19834a6644ca2598704fd1208a4b50cbedd02d"
Unverified Commit 9f79e840 authored by Guolin Ke, committed by GitHub

[python] [R-package] refine the parameters for Dataset (#2594)



* reset

* fix a bug

* fix test

* Update c_api.h

* support not filtering features by min_data

* add warning in reset config

* refine warnings for overriding dataset's parameters

* some cleanup

* clean code

* clean code

* refine C API function doxygen comments

* refined new param description

* refined doxygen comments for R API function

* removed stuff related to int8

* break long line in warning message

* removed tests whose results cannot be validated anymore

* added test for warnings about unchangeable params

* write parameter from dataset to booster

* consider free_raw_data.

* fix params

* fix bug

* implementing R

* fix typo

* filter params in R

* fix R

* not min_data

* refined tests

* fixed linting

* refine

* pylint

* add docstring

* fix docstring

* R lint

* updated description for C API function

* use param aliases in Python

* fixed typo

* fixed typo

* added more params to test

* removed debug print

* fix dataset construct place

* fix merge bug

* Update feature_histogram.hpp

* add is_sparse back

* remove unused parameters

* fix lint

* add data random seed

* update

* [R-package] centralized Dataset parameter aliases and added tests on Dataset parameter updating (#2767)
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent fed09d33
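
Summary of the change: parameters that affect Dataset construction (binning, sampling, column roles, bundling, etc.) are now recorded on the constructed Dataset, written through to the Booster, and checked at train time, so a conflicting override raises an error instead of being silently ignored. A minimal sketch of the new behavior on hypothetical toy data (the error text in the comment is abbreviated to the prefix the new test checks for):

import numpy as np
import lightgbm as lgb

X = np.random.random((100, 2))
y = np.random.random(100)

dataset_params = {"max_bin": 100, "verbose": -1}
train_set = lgb.Dataset(X, y, params=dataset_params).construct()
try:
    # max_bin conflicts with the value baked into the constructed Dataset
    lgb.train(dict(dataset_params, max_bin=150), train_set, num_boost_round=3)
except lgb.basic.LightGBMError as err:
    print(err)  # message begins with "Cannot change max_bin"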
@@ -265,11 +265,11 @@ LGBM_SE LGBM_DatasetGetFieldSize_R(LGBM_SE handle,
   R_API_END();
 }

-LGBM_SE LGBM_DatasetUpdateParam_R(LGBM_SE handle,
-    LGBM_SE params,
-    LGBM_SE call_state) {
+LGBM_SE LGBM_DatasetUpdateParamChecking_R(LGBM_SE old_params,
+    LGBM_SE new_params,
+    LGBM_SE call_state) {
   R_API_BEGIN();
-  CHECK_CALL(LGBM_DatasetUpdateParam(R_GET_PTR(handle), R_CHAR_PTR(params)));
+  CHECK_CALL(LGBM_DatasetUpdateParamChecking(R_CHAR_PTR(old_params), R_CHAR_PTR(new_params)));
   R_API_END();
 }
...
@@ -27,8 +27,8 @@ class FeatureMetainfo {
   MissingType missing_type;
   int8_t offset = 0;
   uint32_t default_bin;
-  int8_t monotone_type;
-  double penalty;
+  int8_t monotone_type = 0;
+  double penalty = 1.0;
   /*! \brief pointer of tree config */
   const Config* config;
   BinType bin_type;
@@ -734,25 +734,62 @@ class HistogramPool {
     }
   }

+  static void SetFeatureInfo(const Dataset* train_data, const Config* config, std::vector<FeatureMetainfo>* feature_meta) {
+    auto& ref_feature_meta = *feature_meta;
+    const int num_feature = train_data->num_features();
+    ref_feature_meta.resize(num_feature);
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < num_feature; ++i) {
+      ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i);
+      ref_feature_meta[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
+      ref_feature_meta[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
+      const int real_fidx = train_data->RealFeatureIndex(i);
+      if (!config->monotone_constraints.empty()) {
+        ref_feature_meta[i].monotone_type = config->monotone_constraints[real_fidx];
+      } else {
+        ref_feature_meta[i].monotone_type = 0;
+      }
+      if (!config->feature_contri.empty()) {
+        ref_feature_meta[i].penalty = config->feature_contri[real_fidx];
+      } else {
+        ref_feature_meta[i].penalty = 1.0;
+      }
+      if (train_data->FeatureBinMapper(i)->GetMostFreqBin() == 0) {
+        ref_feature_meta[i].offset = 1;
+      } else {
+        ref_feature_meta[i].offset = 0;
+      }
+      ref_feature_meta[i].config = config;
+      ref_feature_meta[i].bin_type = train_data->FeatureBinMapper(i)->bin_type();
+    }
+  }
+
+  static void SetFeatureInfoConfig(const Dataset* train_data, const Config* config, std::vector<FeatureMetainfo>* feature_meta) {
+    auto& ref_feature_meta = *feature_meta;
+    const int num_feature = train_data->num_features();
+    ref_feature_meta.resize(num_feature);
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < num_feature; ++i) {
+      const int real_fidx = train_data->RealFeatureIndex(i);
+      if (!config->monotone_constraints.empty()) {
+        ref_feature_meta[i].monotone_type = config->monotone_constraints[real_fidx];
+      } else {
+        ref_feature_meta[i].monotone_type = 0;
+      }
+      if (!config->feature_contri.empty()) {
+        ref_feature_meta[i].penalty = config->feature_contri[real_fidx];
+      } else {
+        ref_feature_meta[i].penalty = 1.0;
+      }
+      ref_feature_meta[i].config = config;
+    }
+  }
+
   void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) {
     if (feature_metas_.empty()) {
+      SetFeatureInfo(train_data, config, &feature_metas_);
       uint64_t bin_cnt_over_features = 0;
-      int num_feature = train_data->num_features();
-      feature_metas_.resize(num_feature);
-      for (int i = 0; i < num_feature; ++i) {
-        feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+      for (int i = 0; i < train_data->num_features(); ++i) {
         bin_cnt_over_features += static_cast<uint64_t>(feature_metas_[i].num_bin);
-        feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
-        feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
-        feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
-        feature_metas_[i].penalty = train_data->FeaturePenalte(i);
-        if (train_data->FeatureBinMapper(i)->GetMostFreqBin() == 0) {
-          feature_metas_[i].offset = 1;
-        } else {
-          feature_metas_[i].offset = 0;
-        }
-        feature_metas_[i].config = config;
-        feature_metas_[i].bin_type = train_data->FeatureBinMapper(i)->bin_type();
       }
       Log::Info("Total Bins %d", bin_cnt_over_features);
     }
@@ -799,17 +836,10 @@ class HistogramPool {
       OMP_LOOP_EX_END();
     }
     OMP_THROW_EX();
-    train_data_ = train_data;
   }

-  void ResetConfig(const Config* config) {
-    int size = static_cast<int>(feature_metas_.size());
-#pragma omp parallel for schedule(static, 512) if (size >= 1024)
-    for (int i = 0; i < size; ++i) {
-      feature_metas_[i].config = config;
-      feature_metas_[i].monotone_type = train_data_->FeatureMonotone(i);
-      feature_metas_[i].penalty = train_data_->FeaturePenalte(i);
-    }
+  void ResetConfig(const Dataset* train_data, const Config* config) {
+    SetFeatureInfoConfig(train_data, config, &feature_metas_);
   }

   /*!
@@ -878,7 +908,6 @@ class HistogramPool {
   std::vector<int> inverse_mapper_;
   std::vector<int> last_used_time_;
   int cur_time_ = 0;
-  const Dataset* train_data_;
 };
 }  // namespace LightGBM
...
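
Note on the feature_histogram.hpp hunks above: monotone_type and penalty now get in-class defaults and are filled from the training Config (config->monotone_constraints, config->feature_contri) via the shared SetFeatureInfo/SetFeatureInfoConfig helpers, instead of from Dataset-side metadata. In user terms these settings now travel with the training parameters; a minimal sketch on hypothetical toy data (feature_penalty is a documented alias of feature_contri):

import numpy as np
import lightgbm as lgb

X = np.random.random((100, 2))
y = np.random.random(100)

params = {
    "objective": "regression",
    "monotone_constraints": [1, -1],  # one entry per feature
    "feature_contri": [0.5, 1.0],     # per-feature penalty on split gain
    "verbose": -1,
}
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=3)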
@@ -140,7 +140,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
   } else {
     config_ = config;
   }
-  histogram_pool_.ResetConfig(config_);
+  histogram_pool_.ResetConfig(train_data_, config_);
   if (CostEfficientGradientBoosting::IsEnable(config_)) {
     cegb_.reset(new CostEfficientGradientBoosting(this));
     cegb_->Init();
...
@@ -51,37 +51,22 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
   larger_buffer_read_start_pos_.resize(this->num_features_);
   global_data_count_in_leaf_.resize(this->config_->num_leaves);
-  smaller_leaf_splits_global_.reset(new LeafSplits(this->train_data_->num_data()));
-  larger_leaf_splits_global_.reset(new LeafSplits(this->train_data_->num_data()));
+  smaller_leaf_splits_global_.reset(new LeafSplits(train_data->num_data()));
+  larger_leaf_splits_global_.reset(new LeafSplits(train_data->num_data()));
   local_config_ = *this->config_;
   local_config_.min_data_in_leaf /= num_machines_;
   local_config_.min_sum_hessian_in_leaf /= num_machines_;
-  this->histogram_pool_.ResetConfig(&local_config_);
+  this->histogram_pool_.ResetConfig(train_data, &local_config_);
   // initialize histograms for global
   smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[this->num_features_]);
   larger_leaf_histogram_array_global_.reset(new FeatureHistogram[this->num_features_]);
-  auto num_total_bin = this->train_data_->NumTotalBin();
+  auto num_total_bin = train_data->NumTotalBin();
   smaller_leaf_histogram_data_.resize(num_total_bin);
   larger_leaf_histogram_data_.resize(num_total_bin);
-  feature_metas_.resize(train_data->num_features());
-#pragma omp parallel for schedule(static)
-  for (int i = 0; i < train_data->num_features(); ++i) {
-    feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
-    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
-    feature_metas_[i].missing_type = train_data->FeatureBinMapper(i)->missing_type();
-    feature_metas_[i].monotone_type = train_data->FeatureMonotone(i);
-    feature_metas_[i].penalty = train_data->FeaturePenalte(i);
-    if (train_data->FeatureBinMapper(i)->GetMostFreqBin() == 0) {
-      feature_metas_[i].offset = 1;
-    } else {
-      feature_metas_[i].offset = 0;
-    }
-    feature_metas_[i].config = this->config_;
-    feature_metas_[i].bin_type = train_data->FeatureBinMapper(i)->bin_type();
-  }
+  HistogramPool::SetFeatureInfo(train_data, this->config_, &feature_metas_);
   uint64_t offset = 0;
   for (int j = 0; j < train_data->num_features(); ++j) {
     offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
@@ -103,12 +88,10 @@ void VotingParallelTreeLearner<TREELEARNER_T>::ResetConfig(const Config* config)
   local_config_.min_data_in_leaf /= num_machines_;
   local_config_.min_sum_hessian_in_leaf /= num_machines_;
-  this->histogram_pool_.ResetConfig(&local_config_);
+  this->histogram_pool_.ResetConfig(this->train_data_, &local_config_);
   global_data_count_in_leaf_.resize(this->config_->num_leaves);
-  for (size_t i = 0; i < feature_metas_.size(); ++i) {
-    feature_metas_[i].config = this->config_;
-  }
+  HistogramPool::SetFeatureInfoConfig(this->train_data_, config, &feature_metas_);
 }

 template <typename TREELEARNER_T>
...
@@ -185,54 +185,6 @@ class TestBasic(unittest.TestCase):
             d1txt = d1f.read()
         self.assertEqual(dtxt, d1txt)

-    def test_get_feature_penalty_and_monotone_constraints(self):
-        X = np.random.random((100, 1))
-        d = lgb.Dataset(X, params={'feature_penalty': [0.5],
-                                   'monotone_constraints': [1]}).construct()
-        np.testing.assert_allclose(d.get_feature_penalty(), [0.5])
-        np.testing.assert_array_equal(d.get_monotone_constraints(), [1])
-        d = lgb.Dataset(X).construct()
-        self.assertIsNone(d.get_feature_penalty())
-        self.assertIsNone(d.get_monotone_constraints())
-
-    def test_add_features_feature_penalty(self):
-        X = np.random.random((100, 2))
-        test_cases = [
-            (None, None, None),
-            ([0.5], None, [0.5, 1]),
-            (None, [0.5], [1, 0.5]),
-            ([0.5], [0.5], [0.5, 0.5])]
-        for (p1, p2, expected) in test_cases:
-            params1 = {'feature_penalty': p1} if p1 is not None else {}
-            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
-            params2 = {'feature_penalty': p2} if p2 is not None else {}
-            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
-            d1.add_features_from(d2)
-            actual = d1.get_feature_penalty()
-            if expected is None:
-                self.assertIsNone(actual)
-            else:
-                np.testing.assert_allclose(actual, expected)
-
-    def test_add_features_monotone_types(self):
-        X = np.random.random((100, 2))
-        test_cases = [
-            (None, None, None),
-            ([1], None, [1, 0]),
-            (None, [1], [0, 1]),
-            ([1], [-1], [1, -1])]
-        for (p1, p2, expected) in test_cases:
-            params1 = {'monotone_constraints': p1} if p1 is not None else {}
-            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
-            params2 = {'monotone_constraints': p2} if p2 is not None else {}
-            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
-            d1.add_features_from(d2)
-            actual = d1.get_monotone_constraints()
-            if actual is None:
-                self.assertIsNone(actual)
-            else:
-                np.testing.assert_array_equal(actual, expected)
-
     def test_cegb_affects_behavior(self):
         X = np.random.random((100, 5))
         X[:, [1, 3]] = 0
...
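
The three tests removed above relied on Dataset.get_feature_penalty() and Dataset.get_monotone_constraints(); with that metadata moved from the Dataset into the Config (see the feature_histogram.hpp hunks), a constructed Dataset no longer carries values these getters could return, matching the commit note "removed tests whose results cannot be validated anymore".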
@@ -1811,6 +1811,84 @@ class TestEngine(unittest.TestCase):
         self.assertNotAlmostEqual(predicted[0], predicted[1])
         self.assertAlmostEqual(predicted[1], predicted[2])

+    def test_dataset_update_params(self):
+        default_params = {"max_bin": 100,
+                          "max_bin_by_feature": [20, 10],
+                          "bin_construct_sample_cnt": 10000,
+                          "min_data_in_bin": 1,
+                          "use_missing": False,
+                          "zero_as_missing": False,
+                          "categorical_feature": [0],
+                          "feature_pre_filter": True,
+                          "pre_partition": False,
+                          "enable_bundle": True,
+                          "data_random_seed": 0,
+                          "is_enable_sparse": True,
+                          "header": True,
+                          "two_round": True,
+                          "label_column": 0,
+                          "weight_column": 0,
+                          "group_column": 0,
+                          "ignore_column": 0,
+                          "min_data_in_leaf": 10,
+                          "verbose": -1}
+        unchangeable_params = {"max_bin": 150,
+                               "max_bin_by_feature": [30, 5],
+                               "bin_construct_sample_cnt": 5000,
+                               "min_data_in_bin": 2,
+                               "use_missing": True,
+                               "zero_as_missing": True,
+                               "categorical_feature": [0, 1],
+                               "feature_pre_filter": False,
+                               "pre_partition": True,
+                               "enable_bundle": False,
+                               "data_random_seed": 1,
+                               "is_enable_sparse": False,
+                               "header": False,
+                               "two_round": False,
+                               "label_column": 1,
+                               "weight_column": 1,
+                               "group_column": 1,
+                               "ignore_column": 1,
+                               "forcedbins_filename": "/some/path/forcedbins.json",
+                               "min_data_in_leaf": 2}
+        X = np.random.random((100, 2))
+        y = np.random.random(100)
+        # decreasing without freeing raw data is allowed
+        lgb_data = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct()
+        default_params["min_data_in_leaf"] -= 1
+        lgb.train(default_params, lgb_data, num_boost_round=3)
+        # decreasing before lazy init is allowed
+        lgb_data = lgb.Dataset(X, y, params=default_params)
+        default_params["min_data_in_leaf"] -= 1
+        lgb.train(default_params, lgb_data, num_boost_round=3)
+        # increasing is allowed
+        default_params["min_data_in_leaf"] += 2
+        lgb.train(default_params, lgb_data, num_boost_round=3)
+        # decreasing with disabled filter is allowed
+        default_params["feature_pre_filter"] = False
+        lgb_data = lgb.Dataset(X, y, params=default_params).construct()
+        default_params["min_data_in_leaf"] -= 4
+        lgb.train(default_params, lgb_data, num_boost_round=3)
+        # decreasing with enabled filter is disallowed;
+        # also changes of other params are disallowed
+        default_params["feature_pre_filter"] = True
+        lgb_data = lgb.Dataset(X, y, params=default_params).construct()
+        for key, value in unchangeable_params.items():
+            new_params = default_params.copy()
+            new_params[key] = value
+            err_msg = ("Reducing `min_data_in_leaf` with `feature_pre_filter=true` may cause *"
+                       if key == "min_data_in_leaf"
+                       else "Cannot change {} *".format(key if key != "forcedbins_filename"
+                                                        else "forced bins"))
+            with np.testing.assert_raises_regex(lgb.basic.LightGBMError, err_msg):
+                lgb.train(new_params, lgb_data, num_boost_round=3)
+
     def test_extra_trees(self):
         # check extra trees increases regularization
         X, y = load_boston(True)
...
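
The new test also pins down exactly when min_data_in_leaf may still be lowered after construction: before lazy initialization, when free_raw_data=False (the kept raw data lets the Dataset be rebuilt), or when feature_pre_filter is disabled so no feature was filtered out during binning. A condensed sketch of the last case, on hypothetical toy data:

import numpy as np
import lightgbm as lgb

X = np.random.random((100, 2))
y = np.random.random(100)

params = {"feature_pre_filter": False, "min_data_in_leaf": 10, "verbose": -1}
train_set = lgb.Dataset(X, y, params=params).construct()
params["min_data_in_leaf"] = 5  # allowed: nothing was pre-filtered at bin construction
lgb.train(params, train_set, num_boost_round=3)  # runs without LightGBMError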