".github/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "c84900753c0332b3ba3e931fb2e4af54bccc67d7"
Commit 6c4a9750 authored by Guolin Ke's avatar Guolin Ke
Browse files

clean code for the split of bins and leaves.

parent 8fb26b06
This diff is collapsed.
......@@ -168,9 +168,14 @@ public:
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, missing_type, default_left,
threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, default_bin, threshold, data_indices, num_data, lte_indices, gt_indices);
}
}
/*!
* \brief From bin to feature value
......
......@@ -37,9 +37,8 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_bin Threshold(bin) of split
* \param threshold_double Threshold on feature value
* \param left_value Model Left child output
* \param right_value Model Right child output
......@@ -50,10 +49,29 @@ public:
* \param default_left default direction for missing value
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double right_value,
int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold_double, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type, bool default_left);
/*!
* \brief Performing a split on tree leaves, with categorical feature
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param real_feature Index of feature, the original index on data
* \param threshold_bin Threshold(bin) of split; a single bin index in this signature
* \param threshold Threshold on feature value, i.e. the real value that corresponds to threshold_bin
* \param left_value Model Left child output
* \param right_value Model Right child output
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param gain Split gain
* \param missing_type Missing value handling type of the split feature
* \return The index of new leaf.
*/
int SplitCategorical(int leaf, int feature, int real_feature, uint32_t threshold_bin,
double threshold, double left_value, double right_value,
data_size_t left_cnt, data_size_t right_cnt, double gain, MissingType missing_type);
/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
......@@ -89,6 +107,7 @@ public:
* \return Prediction result
*/
inline double Predict(const double* feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const;
inline void PredictContrib(const double* feature_values, int num_features, double* output) const;
......@@ -139,7 +158,7 @@ public:
* \param rate The factor of shrinkage
*/
inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static, 512) if (num_leaves_ >= 1024)
#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048)
for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] *= rate;
if (leaf_value_[i] > kMaxTreeOutput) { leaf_value_[i] = kMaxTreeOutput; }
......@@ -157,24 +176,6 @@ public:
/*! \brief Serialize this object to if-else statement*/
std::string ToIfElse(int index, bool is_predict_leaf_index);
template<typename T>
inline static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
return false;
}
}
template<typename T>
inline static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
return true;
} else {
return false;
}
}
inline static bool IsZero(double fval) {
if (fval > -kZeroAsMissingValueRange && fval <= kZeroAsMissingValueRange) {
return true;
......@@ -204,21 +205,44 @@ public:
(*decision_type) |= (input << 2);
}
inline static uint32_t ConvertMissingValue(uint32_t fval, uint32_t threshold, int8_t decision_type, uint32_t default_bin, uint32_t max_bin) {
uint8_t missing_type = GetMissingType(decision_type);
if ((missing_type == 1 && fval == default_bin)
|| (missing_type == 2 && fval == max_bin)) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
fval = threshold;
private:
inline std::string NumericalDecisionIfElse(int node) {
std::stringstream str_buf;
uint8_t missing_type = GetMissingType(decision_type_[node]);
bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask);
if (missing_type == 0 || (missing_type == 1 && default_left && kZeroAsMissingValueRange < threshold_[node])) {
str_buf << "if (fval <= " << threshold_[node] << ") {";
} else if (missing_type == 1) {
if (default_left) {
str_buf << "if (fval <= " << threshold_[node] << " || Tree::IsZero(fval)" << " || std::isnan(fval)) {";
} else {
str_buf << "if (fval <= " << threshold_[node] << " && !Tree::IsZero(fval)" << " && !std::isnan(fval)) {";
}
} else {
if (default_left) {
str_buf << "if (fval <= " << threshold_[node] << " || std::isnan(fval)) {";
} else {
fval = threshold + 1;
str_buf << "if (fval <= " << threshold_[node] << " && !std::isnan(fval)) {";
}
}
return fval;
return str_buf.str();
}
inline static double ConvertMissingValue(double fval, double threshold, int8_t decision_type) {
uint8_t missing_type = GetMissingType(decision_type);
inline std::string CategoricalDecisionIfElse(int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
std::stringstream str_buf;
if (missing_type == 2) {
str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast<int>(fval); }";
} else {
str_buf << "if (std::isnan(fval)) { int_fval = 0; } else { int_fval = static_cast<int>(fval); }";
}
str_buf << "if (int_fval >= 0 && int_fval == " << static_cast<int>(threshold_[node]) << ") {";
return str_buf.str();
}
inline int NumericalDecision(double fval, int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
if (std::isnan(fval)) {
if (missing_type != 2) {
fval = 0.0f;
......@@ -226,28 +250,79 @@ public:
}
if ((missing_type == 1 && IsZero(fval))
|| (missing_type == 2 && std::isnan(fval))) {
if (GetDecisionType(decision_type, kDefaultLeftMask)) {
fval = threshold;
if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
return left_child_[node];
} else {
fval = 10.0f * threshold;
return right_child_[node];
}
}
return fval;
if (fval <= threshold_[node]) {
return left_child_[node];
} else {
return right_child_[node];
}
}
inline static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
inline int NumericalDecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
if ((missing_type == 1 && fval == default_bin)
|| (missing_type == 2 && fval == max_bin)) {
if (GetDecisionType(decision_type_[node], kDefaultLeftMask)) {
return left_child_[node];
} else {
return right_child_[node];
}
}
if (fval <= threshold_in_bin_[node]) {
return left_child_[node];
} else {
return "is";
return right_child_[node];
}
}
static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
static std::vector<bool(*)(double, double)> decision_funs;
inline int CategoricalDecision(double fval, int node) const {
uint8_t missing_type = GetMissingType(decision_type_[node]);
int int_fval = static_cast<int>(fval);
if (int_fval < 0) {
return right_child_[node];;
} else if (std::isnan(fval)) {
// NaN is always in the right
if (missing_type == 2) {
return right_child_[node];
}
int_fval = 0;
}
if (int_fval == static_cast<int>(threshold_[node])) {
return left_child_[node];
}
return right_child_[node];
}
private:
/*!
 * \brief Choose the child of a categorical split node for a binned feature value.
 * \param fval Bin value of the feature
 * \param node Index of the internal node
 * \return left_child_[node] when fval equals threshold_in_bin_[node], else right_child_[node]
 */
inline int CategoricalDecisionInner(uint32_t fval, int node) const {
  const bool matches_threshold = (fval == threshold_in_bin_[node]);
  return matches_threshold ? left_child_[node] : right_child_[node];
}
/*!
 * \brief Dispatch a raw feature value to the categorical or numerical decision of a node.
 * \param fval Raw feature value
 * \param node Index of the internal node
 * \return The child index chosen by the matching decision function
 */
inline int Decision(double fval, int node) const {
  // the categorical bit of decision_type_ selects which rule applies to this node
  if (!GetDecisionType(decision_type_[node], kCategoricalMask)) {
    return NumericalDecision(fval, node);
  }
  return CategoricalDecision(fval, node);
}
/*!
 * \brief Dispatch a binned feature value to the categorical or numerical decision of a node.
 * \param fval Bin value of the feature
 * \param node Index of the internal node
 * \param default_bin Default bin of the feature, forwarded to the numerical rule only
 * \param max_bin Largest bin of the feature, forwarded to the numerical rule only
 * \return The child index chosen by the matching decision function
 */
inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
  // the categorical bit of decision_type_ selects which rule applies to this node
  if (!GetDecisionType(decision_type_[node], kCategoricalMask)) {
    return NumericalDecisionInner(fval, node, default_bin, max_bin);
  }
  return CategoricalDecisionInner(fval, node);
}
inline void Split(int leaf, int feature, int real_feature,
double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
/*!
* \brief Find leaf index of which record belongs by features
* \param feature_values Feature value of this record
......@@ -288,6 +363,7 @@ private:
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
int num_cat_;
/*! \brief Store the information for categorical feature handling and missing value handling. */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */
......@@ -306,9 +382,44 @@ private:
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
};
/*!
 * \brief Bookkeeping that turns leaf `leaf` into an internal node with two leaves.
 *
 * The new internal node takes index num_leaves_ - 1; the left child reuses the
 * index `leaf` and the right child takes index num_leaves_. Leaves are stored in
 * the child arrays as their bitwise complement. This function does not modify
 * num_leaves_ itself and does not touch thresholds or decision types.
 *
 * \param leaf Index of the leaf being split
 * \param feature Inner feature index stored in split_feature_inner_
 * \param real_feature Feature index stored in split_feature_
 * \param left_value Output of the left leaf (NaN is stored as 0)
 * \param right_value Output of the right leaf (NaN is stored as 0)
 * \param left_cnt Data count of the left leaf
 * \param right_cnt Data count of the right leaf
 * \param gain Split gain, stored after Common::AvoidInf
 */
inline void Tree::Split(int leaf, int feature, int real_feature,
                        double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
  const int node = num_leaves_ - 1;
  const int right_leaf = num_leaves_;

  // hook the new node into the slot of the parent that used to point at `leaf`
  const int parent = leaf_parent_[leaf];
  if (parent >= 0) {
    if (left_child_[parent] != ~leaf) {
      right_child_[parent] = node;
    } else {
      left_child_[parent] = node;
    }
  }

  // describe the new internal node
  split_feature_inner_[node] = feature;
  split_feature_[node] = real_feature;
  split_gain_[node] = Common::AvoidInf(gain);
  left_child_[node] = ~leaf;
  right_child_[node] = ~right_leaf;

  // both leaves now hang under the new node
  leaf_parent_[leaf] = node;
  leaf_parent_[right_leaf] = node;

  // the old leaf output becomes the internal value; must be read before it is overwritten
  internal_value_[node] = leaf_value_[leaf];
  internal_count_[node] = left_cnt + right_cnt;

  // left leaf keeps the old index, right leaf is the newly added one
  leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
  leaf_count_[leaf] = left_cnt;
  leaf_value_[right_leaf] = std::isnan(right_value) ? 0.0f : right_value;
  leaf_count_[right_leaf] = right_cnt;

  // both children sit one level below the split leaf; set the right one before bumping the left
  leaf_depth_[right_leaf] = leaf_depth_[leaf] + 1;
  leaf_depth_[leaf] += 1;
}
inline double Tree::Predict(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
......@@ -409,8 +520,7 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node
} else {
const int hot_index =
decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](feature_values[split_index], threshold_[node]);
const int hot_index = Decision(feature_values[split_index], node);
const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
const double w = data_count(node);
const double hot_zero_fraction = data_count(hot_index)/w;
......@@ -469,27 +579,13 @@ inline int Tree::MaxDepth() const {
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
if (has_categorical_) {
if (num_cat_ > 0) {
while (node >= 0) {
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](
fval,
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
node = Decision(feature_values[split_feature_[node]], node);
}
} else {
while (node >= 0) {
double fval = ConvertMissingValue(feature_values[split_feature_[node]], threshold_[node], decision_type_[node]);
if (NumericalDecision<double>(
fval,
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
node = NumericalDecision(feature_values[split_feature_[node]], node);
}
}
return ~node;
......
......@@ -473,7 +473,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
auto label = train_data_->metadata().label();
double init_score = ObtainAutomaticInitialScore(objective_function_, label, num_data_);
std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
new_tree->Split(0, 0, 0, 0, 0, init_score, init_score, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0);
......@@ -553,7 +553,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
// only add default score one-time
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
new_tree->Split(0, 0, 0, 0, 0,
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
......
......@@ -127,7 +127,7 @@ public:
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
double output = class_default_output_[cur_tree_id];
objective_function_->ConvertOutput(&output, &output);
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
new_tree->Split(0, 0, 0, 0, 0,
output, output, 0, 0, -1, MissingType::None, true);
train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) {
......
......@@ -190,11 +190,11 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
const VAL_T minb = static_cast<VAL_T>(min_bin);
const VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -204,59 +204,41 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
const VAL_T bin = data_[idx];
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
......@@ -266,6 +248,33 @@ public:
return lte_count;
}
/*!
 * \brief Partition rows for a categorical split on this bin.
 *
 * A row goes to lte_indices when its stored bin lies in [min_bin, max_bin] and
 * (bin - min_bin) equals threshold; other in-range rows go to gt_indices. Rows
 * whose stored bin is outside [min_bin, max_bin] follow the default side, which
 * is lte_indices only when threshold == default_bin.
 *
 * \return Number of rows written to lte_indices (0 when num_data <= 0)
 */
virtual data_size_t SplitCategorical(
  uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
  uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
  data_size_t* lte_indices, data_size_t* gt_indices) const override {
  if (num_data <= 0) { return 0; }
  data_size_t lte_count = 0;
  data_size_t gt_count = 0;
  // out-of-range rows share one destination, decided once up front
  const bool default_goes_left = (threshold == default_bin);
  data_size_t* const default_indices = default_goes_left ? lte_indices : gt_indices;
  data_size_t* const default_count = default_goes_left ? &lte_count : &gt_count;
  for (data_size_t i = 0; i < num_data; ++i) {
    const data_size_t idx = data_indices[i];
    const uint32_t bin = data_[idx];
    if (bin >= min_bin && bin <= max_bin) {
      if (bin - min_bin == threshold) {
        lte_indices[lte_count++] = idx;
      } else {
        gt_indices[gt_count++] = idx;
      }
    } else {
      default_indices[(*default_count)++] = idx;
    }
  }
  return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
......
......@@ -229,11 +229,11 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin);
uint8_t maxb = static_cast<uint8_t>(max_bin);
const uint8_t minb = static_cast<uint8_t>(min_bin);
const uint8_t maxb = static_cast<uint8_t>(max_bin);
uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -243,59 +243,41 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
......@@ -304,6 +286,34 @@ public:
}
return lte_count;
}
/*!
 * \brief Partition rows for a categorical split on this 4-bit packed bin.
 *
 * Each stored value is the low or high nibble of data_[idx >> 1], selected by
 * the lowest bit of idx. A row goes to lte_indices when its bin lies in
 * [min_bin, max_bin] and (bin - min_bin) equals threshold; other in-range rows
 * go to gt_indices. Rows outside [min_bin, max_bin] follow the default side,
 * which is lte_indices only when default_bin == threshold.
 *
 * \return Number of rows written to lte_indices (0 when num_data <= 0)
 */
virtual data_size_t SplitCategorical(
  uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
  uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
  data_size_t* lte_indices, data_size_t* gt_indices) const override {
  if (num_data <= 0) { return 0; }
  data_size_t lte_count = 0;
  data_size_t gt_count = 0;
  // out-of-range rows share one destination, decided once up front
  const bool default_goes_left = (default_bin == threshold);
  data_size_t* const default_indices = default_goes_left ? lte_indices : gt_indices;
  data_size_t* const default_count = default_goes_left ? &lte_count : &gt_count;
  for (data_size_t i = 0; i < num_data; ++i) {
    const data_size_t idx = data_indices[i];
    // unpack the nibble that belongs to this row
    const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
    if (bin >= min_bin && bin <= max_bin) {
      if (bin - min_bin == threshold) {
        lte_indices[lte_count++] = idx;
      } else {
        gt_indices[gt_count++] = idx;
      }
    } else {
      default_indices[(*default_count)++] = idx;
    }
  }
  return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
......
......@@ -144,12 +144,12 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
// not need to split
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
const VAL_T minb = static_cast<VAL_T>(min_bin);
const VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) {
th -= 1;
......@@ -160,64 +160,74 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (missing_type != MissingType::Zero && default_bin <= threshold) {
if (missing_type == MissingType::NaN) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (default_left && missing_type == MissingType::Zero) {
default_indices = lte_indices;
default_count = &lte_count;
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
if ((default_left && missing_type == MissingType::Zero) || (default_bin <= threshold && missing_type != MissingType::Zero)) {
default_indices = lte_indices;
default_count = &lte_count;
}
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx);
const VAL_T bin = iterator.InnerRawGet(idx);
if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
}
/*!
 * \brief Partition rows for a categorical split on this sparse bin.
 *
 * Values are read through a SparseBinIterator started at data_indices[0] and
 * queried once per row, in the order given by data_indices. A row goes to
 * lte_indices when its bin lies in [min_bin, max_bin] and (bin - min_bin)
 * equals threshold; other in-range rows go to gt_indices. Rows outside
 * [min_bin, max_bin] follow the default side, which is lte_indices only when
 * default_bin == threshold.
 *
 * \return Number of rows written to lte_indices (0 when num_data <= 0)
 */
virtual data_size_t SplitCategorical(
  uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
  uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
  data_size_t* lte_indices, data_size_t* gt_indices) const override {
  if (num_data <= 0) { return 0; }
  data_size_t lte_count = 0;
  data_size_t gt_count = 0;
  SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
  // out-of-range rows share one destination, decided once up front
  const bool default_goes_left = (default_bin == threshold);
  data_size_t* const default_indices = default_goes_left ? lte_indices : gt_indices;
  data_size_t* const default_count = default_goes_left ? &lte_count : &gt_count;
  for (data_size_t i = 0; i < num_data; ++i) {
    const data_size_t idx = data_indices[i];
    const uint32_t bin = iterator.InnerRawGet(idx);
    if (bin >= min_bin && bin <= max_bin) {
      if (bin - min_bin == threshold) {
        lte_indices[lte_count++] = idx;
      } else {
        gt_indices[gt_count++] = idx;
      }
    } else {
      default_indices[(*default_count)++] = idx;
    }
  }
  return lte_count;
}
......
This diff is collapsed.
......@@ -84,9 +84,9 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
double DCGCalculator::CalDCGAtK(data_size_t k, const float* label,
const double* score, data_size_t num_data) {
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
std::vector<data_size_t> sorted_idx(num_data);
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
sorted_idx[i] = i;
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
......@@ -104,9 +104,9 @@ double DCGCalculator::CalDCGAtK(data_size_t k, const float* label,
void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const float* label,
const double * score, data_size_t num_data, std::vector<double>* out) {
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
std::vector<data_size_t> sorted_idx(num_data);
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
sorted_idx[i] = i;
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
......
......@@ -516,27 +516,40 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
}
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf,
inner_feature_index,
train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
best_split_info.threshold,
best_split_info.feature,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
*left_leaf = best_leaf;
if (train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin) {
// split tree, will return right leaf
*right_leaf = tree->Split(best_leaf,
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
best_split_info.default_left);
} else {
*right_leaf = tree->SplitCategorical(best_leaf,
inner_feature_index,
best_split_info.feature,
best_split_info.threshold,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
}
// split data partition
data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
data_partition_->Split(best_leaf, train_data_, inner_feature_index,
best_split_info.threshold, best_split_info.default_left, *right_leaf);
// init the leaves that used on next iteration
......
......@@ -218,6 +218,7 @@
<ClInclude Include="..\src\boosting\gbdt.h" />
<ClInclude Include="..\src\boosting\dart.hpp" />
<ClInclude Include="..\src\boosting\goss.hpp" />
<ClInclude Include="..\src\boosting\rf.hpp" />
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
......
......@@ -192,6 +192,9 @@
<ClInclude Include="..\include\LightGBM\R_object_helper.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\src\boosting\rf.hpp">
<Filter>src\boosting</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment