Unverified Commit f49b0aef authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

add is_sparse back (#2729)

parent bef83598
......@@ -501,6 +501,10 @@ IO Parameters
- LightGBM will auto compress memory according to ``max_bin``. For example, LightGBM will use ``uint8_t`` for feature value if ``max_bin=255``
- ``is_enable_sparse`` :raw-html:`<a id="is_enable_sparse" title="Permalink to this parameter" href="#is_enable_sparse">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``
- used to enable/disable sparse optimization
- ``max_bin_by_feature`` :raw-html:`<a id="max_bin_by_feature" title="Permalink to this parameter" href="#max_bin_by_feature">&#x1F517;&#xFE0E;</a>`, default = ``None``, type = multi-int
- max number of bins for each feature
......
......@@ -478,6 +478,11 @@ struct Config {
// desc = LightGBM will auto compress memory according to ``max_bin``. For example, LightGBM will use ``uint8_t`` for feature value if ``max_bin=255``
int max_bin = 255;
// alias = is_sparse, enable_sparse, sparse
// desc = used to enable/disable sparse optimization
bool is_enable_sparse = true;
// type = multi-int
// default = None
// desc = max number of bins for each feature
......
......@@ -92,6 +92,9 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"forced_splits_file", "forcedsplits_filename"},
{"forced_splits", "forcedsplits_filename"},
{"verbose", "verbosity"},
{"is_sparse", "is_enable_sparse"},
{"enable_sparse", "is_enable_sparse"},
{"sparse", "is_enable_sparse"},
{"subsample_for_bin", "bin_construct_sample_cnt"},
{"hist_pool_size", "histogram_pool_size"},
{"data_seed", "data_random_seed"},
......@@ -222,6 +225,7 @@ std::unordered_set<std::string> Config::parameter_set({
"cegb_penalty_feature_coupled",
"verbosity",
"max_bin",
"is_enable_sparse",
"max_bin_by_feature",
"min_data_in_bin",
"bin_construct_sample_cnt",
......@@ -435,6 +439,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetInt(params, "max_bin", &max_bin);
CHECK(max_bin >1);
GetBool(params, "is_enable_sparse", &is_enable_sparse);
if (GetString(params, "max_bin_by_feature", &tmp_str)) {
max_bin_by_feature = Common::StringToArray<int32_t>(tmp_str, ',');
}
......@@ -634,6 +640,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n";
str_buf << "[is_enable_sparse: " << is_enable_sparse << "]\n";
str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n";
str_buf << "[min_data_in_bin: " << min_data_in_bin << "]\n";
str_buf << "[bin_construct_sample_cnt: " << bin_construct_sample_cnt << "]\n";
......
......@@ -98,6 +98,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
data_size_t total_sample_cnt,
data_size_t num_data,
bool is_use_gpu,
bool is_sparse,
std::vector<int8_t>* multi_val_group) {
const int max_search_group = 100;
const int max_bin_per_group = 256;
......@@ -165,6 +166,10 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
}
}
if (!is_sparse) {
multi_val_group->resize(features_in_group.size(), false);
return features_in_group;
}
std::vector<int> second_round_features;
std::vector<std::vector<int>> features_in_group2;
std::vector<std::vector<bool>> conflict_marks2;
......@@ -217,6 +222,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
const std::vector<int>& used_features,
data_size_t num_data,
bool is_use_gpu,
bool is_sparse,
std::vector<int8_t>* multi_val_group) {
Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
std::vector<size_t> feature_non_zero_cnt;
......@@ -263,8 +269,8 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
}
}
std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2);
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val2);
if (features_in_group.size() > group2.size()) {
features_in_group = group2;
......@@ -311,7 +317,7 @@ void Dataset::Construct(
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(*bin_mappers,
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast<data_size_t>(total_sample_cnt),
used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val);
used_features, num_data_, io_config.device_type == std::string("gpu"), io_config.is_enable_sparse, &group_is_multi_val);
}
num_features_ = 0;
......
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment