Commit 00cb04a2 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Better missing value handle (#747)

* finish the data loading part

* allow prediction.

* fix bug for decision type.

* finish split finding part

* fix bugs.

* bug fixed. add a test .

* fix pep8 .

* update documents.

* fix test bugs.

* fix a format

* fix import error in python test.

* disable missing value handling in categorical features.

* fix a bug.

* add more tests.

* fix pep8

* fix bugs.

* remove the missing handle code for categorical feature.
parent db4374e1
......@@ -203,6 +203,9 @@ The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be s
* The Threshold of margin in early-stopping prediction.
* `use_missing`, default=`true`, type=bool
* Set to `false` will disable the special handling of missing values.
* `zero_as_missing`, default=`false`, type=bool
* Set to `true` will treat all zeros as missing values (including the unshown values in libsvm/sparse matrices).
* Set to `false` will use `na` to represent missing values.
## Objective parameters
......
......@@ -17,6 +17,11 @@ enum BinType {
CategoricalBin
};
enum MissingType {
None,
Zero,
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
......@@ -63,6 +68,9 @@ public:
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
......@@ -81,6 +89,8 @@ public:
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trivial (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
......@@ -129,8 +139,11 @@ public:
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handling
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type);
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
/*!
* \brief Use specific number of bin to calculate the size of this class
......@@ -173,6 +186,7 @@ public:
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
......@@ -360,7 +374,8 @@ public:
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param default_bin_for_zero defualt bin for the zero(missing) bin
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
......@@ -370,7 +385,7 @@ public:
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
......@@ -417,10 +432,20 @@ public:
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else {
value = 0.0f;
}
}
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
......
......@@ -148,7 +148,8 @@ public:
int pred_early_stop_freq = 10;
/*! \brief Threshold of margin of pred_early_stop */
double pred_early_stop_margin = 10.0f;
bool zero_as_missing = false;
bool use_missing = true;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -219,8 +220,6 @@ public:
int gpu_device_id = -1;
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
bool gpu_use_dp = false;
/*! \brief Set to false to disable the handle of missing values */
bool use_missing = true;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -456,7 +455,7 @@ struct ParameterAlias {
"feature_fraction_seed", "enable_bundle", "data_filename", "valid_data_filenames",
"snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename"
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename", "zero_as_missing"
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
......
......@@ -402,12 +402,12 @@ public:
HistogramBinEntry* data) const;
inline data_size_t Split(int feature,
uint32_t threshold, uint32_t default_bin_for_zero,
uint32_t threshold, bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
return feature_groups_[group]->Split(sub_feature, threshold, default_left, data_indices, num_data, lte_indices, gt_indices);
}
inline int SubFeatureBinOffset(int i) const {
......
......@@ -161,14 +161,15 @@ public:
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
uint32_t default_bin_for_zero,
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*!
......
......@@ -19,7 +19,7 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
const score_t kEpsilon = 1e-15f;
const double kMissingValueRange = 1e-20f;
const double kZeroAsMissingValueRange = 1e-20f;
using ReduceFunction = std::function<void(const char*, char*, int)>;
......
......@@ -11,6 +11,8 @@
namespace LightGBM {
#define kMaxTreeOutput (100)
#define kCategoricalMask (1)
#define kDefaultLeftMask (2)
/*!
* \brief Tree model
......@@ -44,15 +46,13 @@ public:
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param gain Split gain
* \param zero_bin bin value for value==0 (missing value)
* \param default_bin default conversion for the missing value, in bin
* \param default_value default conversion for the missing value, in float value
* \param missing_type missing type
* \param default_left default direction for missing value
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain,
uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);
/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
......@@ -127,7 +127,7 @@ public:
std::string ToIfElse(int index, bool is_predict_leaf_index);
template<typename T>
static bool CategoricalDecision(T fval, T threshold) {
inline static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
......@@ -136,7 +136,7 @@ public:
}
template<typename T>
static bool NumericalDecision(T fval, T threshold) {
inline static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
return true;
} else {
......@@ -144,24 +144,67 @@ public:
}
}
static double DefaultValueForZero(double fval, double zero, double out) {
if (fval > -zero && fval <= zero) {
return out;
inline static bool IsZero(double fval) {
if (fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange) {
return true;
} else {
return fval;
return false;
}
}
/*!
* \brief True iff any bit selected by `mask` is set in the packed `decision_type` byte
*        (e.g. kCategoricalMask for categorical splits, kDefaultLeftMask for default-left).
* Note: stray removed-diff lines from the old DefaultValueForZero(uint32_t) overload
* preceded this function in the rendered span; they are dropped here.
*/
inline static bool GetDecisionType(int8_t decision_type, int8_t mask) {
  return (decision_type & mask) > 0;
}
/*!
* \brief Set or clear the flag bit(s) selected by `mask` in *decision_type.
* \param decision_type Packed flags byte to modify
* \param input true to set the bit(s), false to clear them
* \param mask Bit mask to apply (e.g. kCategoricalMask, kDefaultLeftMask)
* Bug fix: the original span contained a stray removed-diff line `return fval;`
* inside the else branch — invalid in a void function and referencing an
* undeclared name; it is removed.
*/
inline static void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) {
  if (input) {
    (*decision_type) |= mask;
  } else {
    // 127 - mask flips the masked bit off while keeping bits 0-6 (sign bit unused)
    (*decision_type) &= (127 - mask);
  }
}
/*!
* \brief Extract the missing-value type stored in bits 2-3 of the packed
*        decision_type byte (0 = None, 1 = Zero, 2 = NaN per MissingType enum).
*/
inline static int8_t GetMissingType(int8_t decision_type) {
  const int8_t shifted = static_cast<int8_t>(decision_type >> 2);
  return static_cast<int8_t>(shifted & 0x3);
}
/*!
* \brief Store missing-value type `input` into bits 2-3 of *decision_type,
*        preserving the two low flag bits (categorical / default-left).
*/
inline static void SetMissingType(int8_t* decision_type, int8_t input) {
  const int8_t low_flags = static_cast<int8_t>(*decision_type & 3);
  *decision_type = static_cast<int8_t>(low_flags | (input << 2));
}
/*!
* \brief Redirect a binned feature value to the split's default side when the
*        bin represents a missing value.
* Missing type is packed in bits 2-3 of decision_type (see GetMissingType):
* 1 = Zero (missing stored in default_bin), 2 = NaN (missing stored in max_bin).
* \param fval Feature value in bin space
* \param threshold Split threshold in bin space; left child iff fval <= threshold
* \param decision_type Packed decision flags (categorical / default-left / missing type)
* \param default_bin Bin used for zero / unshown values of this feature
* \param max_bin Largest bin index of this feature (holds NaN when missing_type == 2)
* \return fval, remapped when missing: threshold (go left) or threshold + 1 (go right)
*/
inline static uint32_t ConvertMissingValue(uint32_t fval, uint32_t threshold, int8_t decision_type, uint32_t default_bin, uint32_t max_bin) {
uint8_t missing_type = GetMissingType(decision_type);
if ((missing_type == 1 && fval == default_bin)
|| (missing_type == 2 && fval == max_bin)) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
// default-left: make the `<=` comparison succeed
fval = threshold;
} else {
// default-right: threshold + 1 always fails the `<=` comparison
fval = threshold + 1;
}
}
return fval;
}
/*!
* \brief Redirect a raw (double) feature value to the split's default side when
*        it represents a missing value.
* Missing type packed in decision_type bits 2-3: 0 = None, 1 = Zero, 2 = NaN.
* A NaN input is coerced to 0 unless this split uses NaN-as-missing handling.
* \param fval Raw feature value
* \param threshold Split threshold; left child iff fval <= threshold
* \param decision_type Packed decision flags (default-left / missing type)
* \return fval, remapped so a missing value follows the recorded default direction
*/
inline static double ConvertMissingValue(double fval, double threshold, int8_t decision_type) {
uint8_t missing_type = GetMissingType(decision_type);
if (std::isnan(fval)) {
if (missing_type != 2) {
// No NaN-aware handling for this split: fall back to treating NaN as zero.
fval = 0.0f;
}
}
if ((missing_type == 1 && IsZero(fval))
|| (missing_type == 2 && std::isnan(fval))) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
fval = threshold;
} else {
// Bug fix: the original used `10.0f * threshold`, which is NOT greater than
// threshold when threshold <= 0 (10 * -5 = -50 <= -5; 10 * 0 = 0 <= 0), so a
// default-right missing value was wrongly sent to the left child. Use the
// smallest representable value strictly greater than threshold instead.
fval = std::nextafter(threshold, INFINITY);
}
}
return fval;
}
static const char* GetDecisionTypeName(int8_t type) {
inline static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
} else {
......@@ -204,12 +247,8 @@ private:
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
/*! \brief Store the information for categorical feature handling and missing value handling. */
std::vector<int8_t> decision_type_;
/*! \brief Default values for the na/0 feature values */
std::vector<double> default_value_;
std::vector<uint32_t> zero_bin_;
std::vector<uint32_t> default_bin_for_zero_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
// used for leaf node
......@@ -251,8 +290,8 @@ inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
if (has_categorical_) {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (decision_funs[decision_type_[node]](
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_[node])) {
node = left_child_[node];
......@@ -262,7 +301,7 @@ inline int Tree::GetLeaf(const double* feature_values) const {
}
} else {
while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (NumericalDecision<double>(
fval,
threshold_[node])) {
......
......@@ -233,7 +233,7 @@ inline static const char* Atof(const char* p, double* out) {
std::string tmp_str(p, cnt);
std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), Common::tolower);
if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
*out = 0;
*out = NAN;
} else if (tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
*out = sign * 1e308;
} else {
......@@ -513,10 +513,10 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
}
inline static double AvoidInf(double x) {
if (x >= std::numeric_limits<double>::max()) {
return std::numeric_limits<double>::max();
} else if(x <= std::numeric_limits<double>::lowest()) {
return std::numeric_limits<double>::lowest();
if (x >= 1e300) {
return 1e300;
} else if(x <= -1e300) {
return -1e300;
} else {
return x;
}
......
......@@ -453,7 +453,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
auto label = train_data_->metadata().label();
double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, 0, 0, 0);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
......@@ -532,7 +532,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, 0, -1, 0, 0, 0);
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id);
......
......@@ -128,7 +128,7 @@ public:
double output = class_default_output_[cur_tree_id];
objective_function_->ConvertOutput(&output, &output);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, 0, -1, 0, 0, 0);
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id);
......
......@@ -478,7 +478,7 @@ int LGBM_DatasetCreateFromMat(const void* data,
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) {
if (std::fabs(row[j]) > kEpsilon) {
if (std::fabs(row[j]) > kEpsilon || std::isnan(row[j])) {
sample_values[j].emplace_back(row[j]);
sample_idx[j].emplace_back(static_cast<int>(i));
}
......@@ -547,7 +547,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr,
sample_values.resize(inner_data.first + 1);
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
if (std::fabs(inner_data.second) > kEpsilon || std::isnan(inner_data.second)) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(static_cast<int>(i));
}
......@@ -615,7 +615,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
for (int j = 0; j < sample_cnt; j++) {
auto val = col_it.Get(sample_indices[j]);
if (std::fabs(val) > kEpsilon) {
if (std::fabs(val) > kEpsilon || std::isnan(val)) {
sample_values[i].emplace_back(val);
sample_idx[i].emplace_back(j);
}
......@@ -1070,7 +1070,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const int tid = omp_get_thread_num();
for (int j = 0; j < ncol; ++j) {
auto val = iterators[tid][j].Get(i);
if (std::fabs(val) > kEpsilon) {
if (std::fabs(val) > kEpsilon || std::isnan(val)) {
one_row.emplace_back(j, val);
}
}
......@@ -1179,9 +1179,6 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
auto tmp_ptr = data_ptr + static_cast<size_t>(num_col) * row_idx;
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(tmp_ptr + i));
if (std::isnan(ret[i])) {
ret[i] = 0.0f;
}
}
return ret;
};
......@@ -1190,9 +1187,6 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
std::vector<double> ret(num_col);
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(data_ptr + static_cast<size_t>(num_row) * i + row_idx));
if (std::isnan(ret[i])) {
ret[i] = 0.0f;
}
}
return ret;
};
......@@ -1205,9 +1199,6 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
auto tmp_ptr = data_ptr + static_cast<size_t>(num_col) * row_idx;
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(tmp_ptr + i));
if (std::isnan(ret[i])) {
ret[i] = 0.0f;
}
}
return ret;
};
......@@ -1216,9 +1207,6 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
std::vector<double> ret(num_col);
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(data_ptr + static_cast<size_t>(num_row) * i + row_idx));
if (std::isnan(ret[i])) {
ret[i] = 0.0f;
}
}
return ret;
};
......@@ -1235,7 +1223,7 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d
auto raw_values = inner_function(row_idx);
std::vector<std::pair<int, double>> ret;
for (int i = 0; i < static_cast<int>(raw_values.size()); ++i) {
if (std::fabs(raw_values[i]) > 1e-15) {
if (std::fabs(raw_values[i]) > kEpsilon || std::isnan(raw_values[i])) {
ret.emplace_back(i, raw_values[i]);
}
}
......@@ -1256,10 +1244,8 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
for (int64_t i = start; i < end; ++i) {
if (!std::isnan(data_ptr[i])) {
ret.emplace_back(indices[i], data_ptr[i]);
}
}
return ret;
};
} else if (indptr_type == C_API_DTYPE_INT64) {
......@@ -1269,10 +1255,8 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
for (int64_t i = start; i < end; ++i) {
if (!std::isnan(data_ptr[i])) {
ret.emplace_back(indices[i], data_ptr[i]);
}
}
return ret;
};
}
......@@ -1285,10 +1269,8 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
for (int64_t i = start; i < end; ++i) {
if (!std::isnan(data_ptr[i])) {
ret.emplace_back(indices[i], data_ptr[i]);
}
}
return ret;
};
} else if (indptr_type == C_API_DTYPE_INT64) {
......@@ -1298,10 +1280,8 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
for (int64_t i = start; i < end; ++i) {
if (!std::isnan(data_ptr[i])) {
ret.emplace_back(indices[i], data_ptr[i]);
}
}
return ret;
};
}
......@@ -1325,7 +1305,6 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
}
int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
if (std::isnan(val)) { val = 0.0f; }
return std::make_pair(idx, val);
};
} else if (col_ptr_type == C_API_DTYPE_INT64) {
......@@ -1339,7 +1318,6 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
}
int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
if (std::isnan(val)) { val = 0.0f; }
return std::make_pair(idx, val);
};
}
......@@ -1356,7 +1334,6 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
}
int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
if (std::isnan(val)) { val = 0.0f; }
return std::make_pair(idx, val);
};
} else if (col_ptr_type == C_API_DTYPE_INT64) {
......@@ -1370,7 +1347,6 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
}
int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
if (std::isnan(val)) { val = 0.0f; }
return std::make_pair(idx, val);
};
}
......
......@@ -22,6 +22,7 @@ BinMapper::BinMapper() {
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
......@@ -63,8 +64,9 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
}
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
......@@ -134,11 +136,82 @@ std::vector<double> GreedyFindBin(const double* distinct_values, const int* coun
return bin_upper_bound;
}
/*!
* \brief Build numerical bin upper bounds when zeros are treated as missing values.
* Splits the distinct values into a negative region, a near-zero "missing" band
* (-kZeroAsMissingValueRange, kZeroAsMissingValueRange], and a positive region;
* bins the negative and positive regions independently with GreedyFindBin and
* reserves one dedicated bin for the zero/missing band.
* \param distinct_values Sorted distinct sample values of the feature
* \param counts Sample count per distinct value
* \param num_distinct_values Number of distinct values
* \param max_bin Maximum number of bins allowed
* \param total_sample_cnt Total number of samples (including zeros)
* \param min_data_in_bin Minimum number of samples per bin
* \return Upper bounds of the bins, last bound being +infinity
*/
std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
// Tally sample counts per region: strictly negative, strictly positive, near-zero.
int left_cnt_data = 0;
int cnt_missing = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroAsMissingValueRange) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroAsMissingValueRange) {
right_cnt_data += counts[i];
} else {
cnt_missing += counts[i];
}
}
// left_cnt = number of strictly-negative distinct values (index of first value
// above the negative region).
// NOTE(review): if EVERY distinct value is negative, the loop never breaks and
// left_cnt stays 0, so no negative-side bins are produced — confirm intended.
int left_cnt = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroAsMissingValueRange) {
left_cnt = i;
break;
}
}
if (left_cnt > 0) {
// Budget for the negative side: share of (max_bin - 1) proportional to its
// non-missing sample count (one bin is reserved for the zero/missing band).
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_missing) * (max_bin - 1));
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
// Clamp the last negative bound so the zero band starts exactly at -kZeroAsMissingValueRange.
bin_upper_bound.back() = -kZeroAsMissingValueRange;
}
// First distinct value strictly above the zero band, or -1 if there is none.
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroAsMissingValueRange) {
right_start = i;
break;
}
}
if (right_start >= 0) {
// Remaining bin budget after the negative bins and the reserved zero bin.
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
// Dedicated bound closing the zero/missing bin, then the positive-side bounds.
bin_upper_bound.push_back(kZeroAsMissingValueRange);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
// No positive values: the final (zero/missing) bin extends to +infinity.
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
if (tmp_num_sample_values == num_sample_values) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
}
na_cnt = num_sample_values - tmp_num_sample_values;
}
num_sample_values = tmp_num_sample_values;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values);
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
......@@ -179,52 +252,17 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_.clear();
int left_cnt_data = 0;
int missing_cnt_data = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kMissingValueRange) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kMissingValueRange) {
right_cnt_data += counts[i];
} else {
missing_cnt_data += counts[i];
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsMissing(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
}
int left_cnt = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kMissingValueRange) {
left_cnt = i;
break;
}
}
if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound_.back() = -kMissingValueRange;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kMissingValueRange) {
right_start = i;
break;
}
}
if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound_.push_back(kMissingValueRange);
bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
......@@ -235,9 +273,14 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
}
cnt_in_bin[i_bin] += counts[i];
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
}
CHECK(num_bin_ <= max_bin);
} else {
// No missing value handling for categorical features
missing_type_ = MissingType::None;
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
......@@ -293,6 +336,7 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
......@@ -305,6 +349,8 @@ int BinMapper::SizeForSpecificBin(int bin) {
void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += sizeof(missing_type_);
std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
......@@ -327,6 +373,8 @@ void BinMapper::CopyTo(char * buffer) const {
void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += sizeof(missing_type_);
std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
......@@ -354,6 +402,7 @@ void BinMapper::CopyFrom(const char * buffer) {
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&missing_type_, sizeof(missing_type_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
......@@ -368,7 +417,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
}
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
......
......@@ -280,7 +280,8 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetBool(params, "pred_early_stop", &pred_early_stop);
GetInt(params, "pred_early_stop_freq", &pred_early_stop_freq);
GetDouble(params, "pred_early_stop_margin", &pred_early_stop_margin);
GetBool(params, "use_missing", &use_missing);
GetBool(params, "zero_as_missing", &zero_as_missing);
device_type = GetDeviceType(params);
}
......@@ -365,7 +366,6 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetInt(params, "gpu_platform_id", &gpu_platform_id);
GetInt(params, "gpu_device_id", &gpu_device_id);
GetBool(params, "gpu_use_dp", &gpu_use_dp);
GetBool(params, "use_missing", &use_missing);
}
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
......
......@@ -508,7 +508,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type, io_config_.use_missing, io_config_.zero_as_missing);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
......@@ -677,7 +677,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
sample_values.resize(inner_data.first + 1);
sample_indices.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
if (std::fabs(inner_data.second) > kEpsilon || std::isnan(inner_data.second)) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_indices[inner_data.first].emplace_back(i);
}
......@@ -730,7 +730,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
sample_data.size(), io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
sample_data.size(), io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type, io_config_.use_missing, io_config_.zero_as_missing);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
......@@ -793,7 +793,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(), static_cast<int>(sample_values[start[rank] + i].size()),
sample_data.size(), io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
sample_data.size(), io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type, io_config_.use_missing, io_config_.zero_as_missing);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
......
......@@ -188,7 +188,7 @@ public:
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
......@@ -205,15 +205,28 @@ public:
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin_for_zero <= threshold) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if ( bin < minb || bin > maxb || t_default_bin == bin) {
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
......@@ -221,7 +234,20 @@ public:
}
}
} else {
if (default_bin_for_zero == threshold) {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
......@@ -239,6 +265,7 @@ public:
}
return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
......
......@@ -227,7 +227,7 @@ public:
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
......@@ -244,15 +244,28 @@ public:
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin_for_zero <= threshold) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
......@@ -260,7 +273,20 @@ public:
}
}
} else {
if (default_bin_for_zero == threshold) {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
......
......@@ -29,7 +29,7 @@ public:
*out_label = val;
bias = -1;
}
else if (fabs(val) > 1e-10) {
else if (std::fabs(val) > kEpsilon || std::isnan(val)) {
out_features->emplace_back(idx + bias, val);
}
++idx;
......@@ -59,7 +59,7 @@ public:
if (idx == label_idx_) {
*out_label = val;
bias = -1;
} else if (fabs(val) > 1e-10) {
} else if (std::fabs(val) > kEpsilon || std::isnan(val)) {
out_features->emplace_back(idx + bias, val);
}
++idx;
......
......@@ -142,7 +142,7 @@ public:
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
// not need to split
......@@ -161,15 +161,28 @@ public:
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin_for_zero <= threshold) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
......@@ -177,7 +190,20 @@ public:
}
}
} else {
if (default_bin_for_zero == threshold) {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
......
......@@ -30,10 +30,7 @@ Tree::Tree(int max_leaves)
split_feature_.resize(max_leaves_ - 1);
threshold_in_bin_.resize(max_leaves_ - 1);
threshold_.resize(max_leaves_ - 1);
decision_type_.resize(max_leaves_ - 1);
default_value_.resize(max_leaves_ - 1);
zero_bin_.resize(max_leaves_ - 1);
default_bin_for_zero_.resize(max_leaves_ - 1);
decision_type_.resize(max_leaves_ - 1, 0);
split_gain_.resize(max_leaves_ - 1);
leaf_parent_.resize(max_leaves_);
leaf_value_.resize(max_leaves_);
......@@ -48,13 +45,14 @@ Tree::Tree(int max_leaves)
shrinkage_ = 1.0f;
has_categorical_ = false;
}
Tree::~Tree() {
}
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature, double threshold_double,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain,
uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value) {
MissingType missing_type, bool default_left) {
int new_node_idx = num_leaves_ - 1;
// update parent info
int parent = leaf_parent_[leaf];
......@@ -70,15 +68,20 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
split_feature_inner_[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
zero_bin_[new_node_idx] = zero_bin;
default_bin_for_zero_[new_node_idx] = default_bin_for_zero;
default_value_[new_node_idx] = Common::AvoidInf(default_value);
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
if (bin_type == BinType::NumericalBin) {
SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask);
} else {
has_categorical_ = true;
decision_type_[new_node_idx] = 1;
SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask);
}
SetDecisionType(&decision_type_[new_node_idx], default_left, kDefaultLeftMask);
if (missing_type == MissingType::None) {
SetMissingType(&decision_type_[new_node_idx], 0);
} else if (missing_type == MissingType::Zero) {
SetMissingType(&decision_type_[new_node_idx], 1);
} else if (missing_type == MissingType::NaN) {
SetMissingType(&decision_type_[new_node_idx], 2);
}
threshold_in_bin_[new_node_idx] = threshold_bin;
......@@ -107,10 +110,18 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (num_leaves_ <= 1) { return; }
std::vector<uint32_t> default_bins(num_leaves_ - 1);
std::vector<uint32_t> max_bins(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
auto bin_mapper = data->FeatureBinMapper(fidx);
default_bins[i] = bin_mapper->GetDefaultBin();
max_bins[i] = bin_mapper->num_bin() - 1;
}
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
......@@ -120,8 +131,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]](
uint32_t fval = ConvertMissingValue(iter[node]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
......@@ -134,7 +145,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
......@@ -143,8 +154,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]](
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
......@@ -159,7 +170,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
......@@ -169,7 +180,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
uint32_t fval = ConvertMissingValue(iter[node]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
......@@ -181,7 +192,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
[this, &data, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
......@@ -190,7 +201,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(i), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
......@@ -208,10 +219,18 @@ void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
if (num_leaves_ <= 1) { return; }
std::vector<uint32_t> default_bins(num_leaves_ - 1);
std::vector<uint32_t> max_bins(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
auto bin_mapper = data->FeatureBinMapper(fidx);
default_bins[i] = bin_mapper->GetDefaultBin();
max_bins[i] = bin_mapper->num_bin() - 1;
}
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
......@@ -222,8 +241,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]](
uint32_t fval = ConvertMissingValue(iter[node]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
......@@ -236,7 +255,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
......@@ -246,8 +265,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]](
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (inner_decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_in_bin_[node])) {
node = left_child_[node];
......@@ -262,7 +281,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner_[i];
......@@ -273,7 +292,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
uint32_t fval = ConvertMissingValue(iter[node]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
......@@ -285,7 +304,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
[this, data, used_data_indices, score, &default_bins, &max_bins](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
......@@ -295,7 +314,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
uint32_t fval = ConvertMissingValue(iter[split_feature_inner_[node]]->Get(idx), threshold_in_bin_[node], decision_type_[node], default_bins[node], max_bins[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
......@@ -320,8 +339,6 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "decision_type="
<< Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
str_buf << "default_value="
<< Common::ArrayToString<double>(default_value_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "right_child="
......@@ -368,7 +385,6 @@ std::string Tree::NodeToJSON(int index) {
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
str_buf << "\"default_value\":" << default_value_[index] << "," << std::endl;
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
......@@ -409,12 +425,8 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) {
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
if (index >= 0) {
// non-leaf
std::stringstream tmp_str_buf;
tmp_str_buf << "arr[" << split_feature_[index] << "]";
std::string str_fval = tmp_str_buf.str();
str_buf << "if( ( " << str_fval <<" <= " << kMissingValueRange << " && "<< str_fval << " > -" << kMissingValueRange <<" ? "
<< default_value_[index] << " : " << str_fval << " ) ";
if (decision_type_[index] == 0) {
str_buf << "if (Tree::ConvertMissingValue(arr[" << split_feature_[index] << "], " << threshold_[index] << ", " << static_cast<int>(decision_type_[index]) << ") ";
if (GetDecisionType(decision_type_[index], kCategoricalMask) == 0) {
str_buf << "<";
} else {
str_buf << "=";
......@@ -485,12 +497,6 @@ Tree::Tree(const std::string& str) {
Log::Fatal("Tree model string format error, should contain threshold field");
}
if (key_vals.count("default_value")) {
default_value_ = Common::StringToArray<double>(key_vals["default_value"], ' ', num_leaves_ - 1);
} else {
Log::Fatal("Tree model string format error, should contain default_value field");
}
if (key_vals.count("leaf_value")) {
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
} else {
......
......@@ -91,7 +91,7 @@ public:
* \param threshold threshold that want to split
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, uint32_t default_bin_for_zero, int right_leaf) {
void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, bool default_left, int right_leaf) {
const data_size_t min_inner_size = 512;
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
......@@ -111,7 +111,7 @@ public:
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split data inner, reduce the times of function called
data_size_t cur_left_count = dataset->Split(feature, threshold, default_bin_for_zero, indices_.data() + begin + cur_start, cur_cnt,
data_size_t cur_left_count = dataset->Split(feature, threshold, default_left, indices_.data() + begin + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment