Unverified Commit c35ecfbf authored by Zhuyi Xue's avatar Zhuyi Xue Committed by GitHub
Browse files

renamed cur_cat => cur_cat_idx and added some comments (#5522)

parent 8b720844
......@@ -352,7 +352,7 @@ namespace LightGBM {
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::vector<int> counts; // count of data points for each distinct feature value.
std::stable_sort(values, values + num_sample_values);
......@@ -389,7 +389,7 @@ namespace LightGBM {
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
std::vector<int> cnt_in_bin; // count of data points in each bin.
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
......@@ -446,12 +446,12 @@ namespace LightGBM {
Log::Warning("Met categorical feature which contains sparse values. "
"Consider renumbering to consecutive integers started from zero");
}
// sort by counts
// sort by counts in descending order
Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
// will ignore the categories with small counts
int cut_cnt = static_cast<int>(
Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
size_t cur_cat = 0;
size_t cur_cat_idx = 0; // index of current category.
categorical_2_bin_.clear();
bin_2_categorical_.clear();
int used_cnt = 0;
......@@ -467,20 +467,20 @@ namespace LightGBM {
categorical_2_bin_[-1] = 0;
cnt_in_bin.push_back(0);
num_bin_ = 1;
while (cur_cat < distinct_values_int.size()
while (cur_cat_idx < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) {
if (counts_int[cur_cat_idx] < min_data_in_bin && cur_cat_idx > 1) {
break;
}
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
bin_2_categorical_.push_back(distinct_values_int[cur_cat_idx]);
categorical_2_bin_[distinct_values_int[cur_cat_idx]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat_idx];
cnt_in_bin.push_back(counts_int[cur_cat_idx]);
++num_bin_;
++cur_cat;
++cur_cat_idx;
}
// Use MissingType::None to indicate that every category was assigned a bin and there are no NaN values
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
if (cur_cat_idx == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment