Commit ba5c7459 authored by ww, committed by Guolin Ke
Browse files

Add func to handle sparse testing data (#1045)

* first commit

* fix bug

* fix by commits

* fix by commit

* add funcs to IfElse

* fix bug

* fix bug

* fix bug

* change tab to space
parent 302f84bc
......@@ -10,6 +10,7 @@
#include <vector>
#include <string>
#include <map>
#include <unordered_map>
namespace LightGBM {
......@@ -120,6 +121,10 @@ public:
virtual void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
/*!
 * \brief Raw score prediction for one record given as a sparse map
 * \param features Feature values of this record, keyed by feature index
 *        (the GBDT/Tree implementation in this change evaluates an absent key as 0)
 * \param output Prediction result for this record
 * \param early_stop Early stopping instance consulted during prediction
 */
virtual void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
/*!
* \brief Prediction for one record, sigmoid transformation will be used if needed
* \param feature_values Feature value on this record
......@@ -129,6 +134,10 @@ public:
virtual void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
/*!
 * \brief Prediction for one record given as a sparse map; the map-based counterpart of Predict
 * \param features Feature values of this record, keyed by feature index
 *        (the GBDT/Tree implementation in this change evaluates an absent key as 0)
 * \param output Prediction result for this record
 * \param early_stop Early stopping instance consulted during prediction
 */
virtual void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
/*!
* \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record
......@@ -137,6 +146,9 @@ public:
virtual void PredictLeafIndex(
const double* features, double* output) const = 0;
/*!
 * \brief Prediction with leaf index for one record given as a sparse map
 * \param features Feature values of this record, keyed by feature index
 *        (the GBDT/Tree implementation in this change evaluates an absent key as 0)
 * \param output Receives the predicted leaf indices (stored as double)
 */
virtual void PredictLeafIndexByMap(
const std::unordered_map<int, double>& features, double* output) const = 0;
/*!
* \brief Feature contributions for the model's prediction of one record
* \param feature_values Feature value on this record
......
......@@ -10,6 +10,7 @@
#include <string>
#include <vector>
#include <memory>
#include <map>
namespace LightGBM {
......@@ -118,8 +119,11 @@ public:
* \return Prediction result
*/
inline double Predict(const double* feature_values) const;
/*!
 * \brief Prediction on one sparse record
 * \param feature_values Feature values of this record keyed by feature index; an absent key is evaluated as 0
 * \return Prediction result
 */
inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const;
/*!
 * \brief Leaf index prediction on one sparse record
 * \param feature_values Feature values of this record keyed by feature index; an absent key is evaluated as 0
 * \return Leaf index (0 when the tree has at most one leaf)
 */
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;
inline void PredictContrib(const double* feature_values, int num_features, double* output);
......@@ -307,6 +311,7 @@ private:
* \return Leaf index
*/
inline int GetLeaf(const double* feature_values) const;
/*!
 * \brief Find the leaf a sparse record falls into
 * \param feature_values Feature values of this record keyed by feature index; an absent key is evaluated as 0
 * \return Leaf index
 */
inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;
/*! \brief Serialize one node to json*/
std::string NodeToJSON(int index) const;
......@@ -314,6 +319,8 @@ private:
/*! \brief Serialize one node to if-else statement*/
std::string NodeToIfElse(int index, bool is_predict_leaf_index) const;
/*! \brief Serialize one node to if-else statement that reads feature values from an unordered_map named arr */
std::string NodeToIfElseByMap(int index, bool is_predict_leaf_index) const;
double ExpectedValue() const;
int MaxDepth();
......@@ -440,6 +447,15 @@ inline double Tree::Predict(const double* feature_values) const {
}
}
/*!
 * \brief Prediction on one sparse record
 * \param feature_values Feature values keyed by feature index
 * \return Output of the leaf the record falls into
 */
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
  // a tree with at most one leaf has nothing to traverse
  if (num_leaves_ <= 1) {
    return leaf_value_[0];
  }
  return LeafOutput(GetLeafByMap(feature_values));
}
inline int Tree::PredictLeafIndex(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
......@@ -449,6 +465,15 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
}
}
/*!
 * \brief Leaf index prediction on one sparse record
 * \param feature_values Feature values keyed by feature index
 * \return Index of the leaf the record falls into; 0 when the tree has at most one leaf
 */
inline int Tree::PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const {
  return num_leaves_ > 1 ? GetLeafByMap(feature_values) : 0;
}
inline void Tree::PredictContrib(const double* feature_values, int num_features, double* output) {
output[num_features] += ExpectedValue();
// Run the recursion with preallocated space for the unique path data
......@@ -484,6 +509,21 @@ inline int Tree::GetLeaf(const double* feature_values) const {
return ~node;
}
/*!
 * \brief Find the leaf a sparse record falls into
 * \param feature_values Feature values keyed by feature index; a feature
 *        that is absent from the map is evaluated as 0
 * \return Leaf index
 */
inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const {
  // A single find() per visited node replaces the count() + at() pair,
  // halving the number of hash lookups on the prediction hot path.
  const auto value_of = [&feature_values](int feature) {
    const auto it = feature_values.find(feature);
    return it != feature_values.end() ? it->second : 0.0;
  };
  int node = 0;
  if (num_cat_ > 0) {
    while (node >= 0) {
      node = Decision(value_of(split_feature_[node]), node);
    }
  } else {
    while (node >= 0) {
      node = NumericalDecision(value_of(split_feature_[node]), node);
    }
  }
  // leaves are stored as the bitwise complement of their index
  return ~node;
}
} // namespace LightGBM
#endif // LightGBM_TREE_H_
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_
/*! \brief Map-based prediction is only considered when the model has more than this many features */
#define MAX_FEATURE 10000
/*! \brief Map-based prediction additionally requires (feature count / entries in the row), as an integer division, to exceed this */
#define SPARSITY 100
#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
......@@ -58,16 +61,21 @@ public:
num_pred_one_row_ = boosting_->NumPredictOneRow(num_iteration, is_predict_leaf_index, is_predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_ = std::vector<std::vector<double>>(num_threads_, std::vector<double>(num_feature_, 0.0f));
predict_buf_map_ = std::vector<std::unordered_map<int, double>>(num_threads_);
if (is_predict_leaf_index) {
// Leaf-index prediction for one sparse row. Very wide models with very sparse
// rows go through the per-thread hash map; everything else uses the dense buffer.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  const int num_entries = static_cast<int>(features.size());
  // An empty row would make the sparsity ratio divide by zero; it is as sparse
  // as a row can be, so send it down the map path.
  const bool use_map = num_feature_ > MAX_FEATURE &&
                       (num_entries == 0 || num_feature_ / num_entries > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictLeafIndexByMap(predict_buf_map_[tid], output);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    // get result for leaf index
    boosting_->PredictLeafIndex(predict_buf_[tid].data(), output);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};
} else if (is_predict_contrib) {
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
int tid = omp_get_thread_num();
......@@ -76,21 +84,32 @@ public:
boosting_->PredictContrib(predict_buf_[tid].data(), output, &early_stop_);
ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
};
} else {
if (is_raw_score) {
// Raw-score prediction for one sparse row. Very wide models with very sparse
// rows go through the per-thread hash map; everything else uses the dense buffer.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  const int num_entries = static_cast<int>(features.size());
  // An empty row would make the sparsity ratio divide by zero; it is as sparse
  // as a row can be, so send it down the map path.
  const bool use_map = num_feature_ > MAX_FEATURE &&
                       (num_entries == 0 || num_feature_ / num_entries > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictRawByMap(predict_buf_map_[tid], output, &early_stop_);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    boosting_->PredictRaw(predict_buf_[tid].data(), output, &early_stop_);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};
} else {
// Prediction for one sparse row through Predict / PredictByMap. Very wide models with very
// sparse rows go through the per-thread hash map; everything else uses the dense buffer.
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
  int tid = omp_get_thread_num();
  const int num_entries = static_cast<int>(features.size());
  // An empty row would make the sparsity ratio divide by zero; it is as sparse
  // as a row can be, so send it down the map path.
  const bool use_map = num_feature_ > MAX_FEATURE &&
                       (num_entries == 0 || num_feature_ / num_entries > SPARSITY);
  if (use_map) {
    CopyToPredictMap(tid, features);
    boosting_->PredictByMap(predict_buf_map_[tid], output, &early_stop_);
    ClearPredictMap(tid);
  } else {
    CopyToPredictBuffer(predict_buf_[tid].data(), features);
    boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
    ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), features);
  }
};
}
}
......@@ -225,6 +244,19 @@ private:
}
}
/*!
 * \brief Load one row into the map buffer of thread tid
 * \param tid Index into predict_buf_map_
 * \param features (feature index, value) pairs of the row; pairs whose index
 *        is not below num_feature_ are dropped
 */
void CopyToPredictMap(int tid, const std::vector<std::pair<int, double>>& features) {
  for (const auto& entry : features) {
    if (entry.first >= num_feature_) {
      continue;
    }
    predict_buf_map_[tid][entry.first] = entry.second;
  }
}
/*! \brief Remove every entry from the map buffer of thread tid */
void ClearPredictMap(int tid) {
  auto& row_map = predict_buf_map_[tid];
  row_map.clear();
}
/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief function for prediction */
......@@ -234,6 +266,7 @@ private:
int num_pred_one_row_;
int num_threads_;
/*! \brief One dense feature buffer per thread, each sized num_feature_ at construction */
std::vector<std::vector<double>> predict_buf_;
/*! \brief One feature-index -> value map per thread, used by the map-based prediction path */
std::vector<std::unordered_map<int, double>> predict_buf_map_;
};
} // namespace LightGBM
......
......@@ -13,6 +13,7 @@
#include <fstream>
#include <memory>
#include <mutex>
#include <map>
namespace LightGBM {
......@@ -186,11 +187,19 @@ public:
void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
/*!
 * \brief Raw score prediction for one record given as a feature-index -> value map
 * \param features Feature values of this record; an absent feature is evaluated as 0 by the trees
 * \param output Receives num_tree_per_iteration_ accumulated raw scores
 * \param early_stop Early stopping instance checked every round_period iterations
 */
void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
/*!
 * \brief Prediction for one record given as a feature-index -> value map:
 *        raw scores followed by averaging or the objective's output conversion
 * \param features Feature values of this record; an absent feature is evaluated as 0 by the trees
 * \param output Receives the prediction result
 * \param early_stop Early stopping instance forwarded to PredictRawByMap
 */
void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
void PredictLeafIndex(const double* features, double* output) const override;
/*!
 * \brief Leaf index prediction for one record given as a feature-index -> value map
 * \param features Feature values of this record; an absent feature is evaluated as 0 by the trees
 * \param output Receives one leaf index per tree used for prediction
 */
void PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const override;
void PredictContrib(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
......
......@@ -101,6 +101,37 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "}" << std::endl;
str_buf << std::endl;
// PredictRawByMap
str_buf << "double (*PredictTreeByMapPtr[])(const std::unordered_map<int, double>&) = { ";
for (int i = 0; i < num_used_model; ++i) {
if (i > 0) {
str_buf << " , ";
}
str_buf << "PredictTree" << i << "ByMap";
}
str_buf << " };" << std::endl << std::endl;
std::stringstream pred_str_buf_map;
pred_str_buf_map << "\t" << "int early_stop_round_counter = 0;" << std::endl;
pred_str_buf_map << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << std::endl;
pred_str_buf_map << "\t" << "for (int i = 0; i < num_iteration_for_pred_; ++i) {" << std::endl;
pred_str_buf_map << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
pred_str_buf_map << "\t\t\t" << "output[k] += (*PredictTreeByMapPtr[i * num_tree_per_iteration_ + k])(features);" << std::endl;
pred_str_buf_map << "\t\t" << "}" << std::endl;
pred_str_buf_map << "\t\t" << "++early_stop_round_counter;" << std::endl;
pred_str_buf_map << "\t\t" << "if (early_stop->round_period == early_stop_round_counter) {" << std::endl;
pred_str_buf_map << "\t\t\t" << "if (early_stop->callback_function(output, num_tree_per_iteration_))" << std::endl;
pred_str_buf_map << "\t\t\t\t" << "return;" << std::endl;
pred_str_buf_map << "\t\t\t" << "early_stop_round_counter = 0;" << std::endl;
pred_str_buf_map << "\t\t" << "}" << std::endl;
pred_str_buf_map << "\t" << "}" << std::endl;
str_buf << "void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << pred_str_buf_map.str();
str_buf << "}" << std::endl;
str_buf << std::endl;
// Predict
str_buf << "void GBDT::Predict(const double* features, double *output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << "\t" << "PredictRaw(features, output, early_stop);" << std::endl;
......@@ -115,6 +146,21 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "}" << std::endl;
str_buf << std::endl;
// PredictByMap
str_buf << "void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {" << std::endl;
str_buf << "\t" << "PredictRawByMap(features, output, early_stop);" << std::endl;
str_buf << "\t" << "if (average_output_) {" << std::endl;
str_buf << "\t\t" << "for (int k = 0; k < num_tree_per_iteration_; ++k) {" << std::endl;
str_buf << "\t\t\t" << "output[k] /= num_iteration_for_pred_;" << std::endl;
str_buf << "\t\t" << "}" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "\t" << "else if (objective_function_ != nullptr) {" << std::endl;
str_buf << "\t\t" << "objective_function_->ConvertOutput(output, output);" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;
str_buf << std::endl;
// PredictLeafIndex
for (int i = 0; i < num_used_model; ++i) {
str_buf << models_[i]->ToIfElse(i, true) << std::endl;
......@@ -136,6 +182,23 @@ std::string GBDT::ModelToIfElse(int num_iteration) const {
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;
//PredictLeafIndexByMap
str_buf << "double (*PredictTreeLeafByMapPtr[])(const std::unordered_map<int, double>&) = { ";
for (int i = 0; i < num_used_model; ++i) {
if (i > 0) {
str_buf << " , ";
}
str_buf << "PredictTree" << i << "LeafByMap";
}
str_buf << " };" << std::endl << std::endl;
str_buf << "void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {" << std::endl;
str_buf << "\t" << "int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;" << std::endl;
str_buf << "\t" << "for (int i = 0; i < total_tree; ++i) {" << std::endl;
str_buf << "\t\t" << "output[i] = (*PredictTreeLeafByMapPtr[i])(features);" << std::endl;
str_buf << "\t" << "}" << std::endl;
str_buf << "}" << std::endl;
str_buf << "} // namespace LightGBM" << std::endl;
return str_buf.str();
......
......@@ -26,6 +26,26 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
}
}
/*!
 * \brief Raw score prediction for one record given as a feature-index -> value map
 * \param features Feature values of this record
 * \param output Receives num_tree_per_iteration_ accumulated raw scores
 * \param early_stop Early stopping instance; its callback is consulted every round_period iterations
 */
void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
  // zero the num_tree_per_iteration_ accumulators
  std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
  int rounds_since_check = 0;
  for (int iter = 0; iter < num_iteration_for_pred_; ++iter) {
    // add the output of every tree belonging to this iteration
    const int first_tree = iter * num_tree_per_iteration_;
    for (int k = 0; k < num_tree_per_iteration_; ++k) {
      output[k] += models_[first_tree + k]->PredictByMap(features);
    }
    // early stopping is only evaluated once per round_period iterations
    ++rounds_since_check;
    if (early_stop->round_period != rounds_since_check) {
      continue;
    }
    if (early_stop->callback_function(output, num_tree_per_iteration_)) {
      return;
    }
    rounds_since_check = 0;
  }
}
void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
PredictRaw(features, output, early_stop);
if (average_output_) {
......@@ -37,6 +57,17 @@ void GBDT::Predict(const double* features, double* output, const PredictionEarly
}
}
/*!
 * \brief Prediction for one record given as a feature-index -> value map
 * \param features Feature values of this record
 * \param output Receives the prediction result
 * \param early_stop Early stopping instance forwarded to PredictRawByMap
 */
void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
  PredictRawByMap(features, output, early_stop);
  if (!average_output_) {
    // no averaging: let the objective (if any) convert the raw scores in place
    if (objective_function_ != nullptr) {
      objective_function_->ConvertOutput(output, output);
    }
    return;
  }
  // averaged output: divide each accumulated score by the iteration count
  for (int k = 0; k < num_tree_per_iteration_; ++k) {
    output[k] /= num_iteration_for_pred_;
  }
}
void GBDT::PredictLeafIndex(const double* features, double* output) const {
int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
for (int i = 0; i < total_tree; ++i) {
......@@ -44,4 +75,11 @@ void GBDT::PredictLeafIndex(const double* features, double* output) const {
}
}
/*!
 * \brief Leaf index prediction for one record given as a feature-index -> value map
 * \param features Feature values of this record
 * \param output Receives one leaf index per tree used for prediction
 */
void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {
  const int tree_count = num_iteration_for_pred_ * num_tree_per_iteration_;
  int tree_idx = 0;
  while (tree_idx < tree_count) {
    output[tree_idx] = models_[tree_idx]->PredictLeafIndexByMap(features);
    ++tree_idx;
  }
}
} // namespace LightGBM
......@@ -377,6 +377,37 @@ std::string Tree::ToIfElse(int index, bool is_predict_leaf_index) const {
str_buf << NodeToIfElse(0, is_predict_leaf_index);
}
str_buf << " }" << std::endl;
//Predict func by Map to ifelse
str_buf << "double PredictTree" << index;
if (is_predict_leaf_index) {
str_buf << "LeafByMap";
}
else {
str_buf << "ByMap";
}
str_buf << "(const std::unordered_map<int, double>& arr) { ";
if (num_leaves_ <= 1) {
str_buf << "return " << leaf_value_[0] << ";";
}
else {
str_buf << "const std::vector<uint32_t> cat_threshold = {";
for (size_t i = 0; i < cat_threshold_.size(); ++i) {
if (i != 0) {
str_buf << ",";
}
str_buf << cat_threshold_[i];
}
str_buf << "};";
// use this for the missing value conversion
str_buf << "double fval = 0.0f; ";
if (num_cat_ > 0) {
str_buf << "int int_fval = 0; ";
}
str_buf << NodeToIfElseByMap(0, is_predict_leaf_index);
}
str_buf << " }" << std::endl;
return str_buf.str();
}
......@@ -411,6 +442,37 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) const {
return str_buf.str();
}
/*!
 * \brief Serialize one node (and its subtrees) to if-else source code that
 *        reads feature values from an unordered_map named arr
 * \param index Node index; a negative value is the bitwise complement of a leaf index
 * \param is_predict_leaf_index If true, leaves return their index instead of their value
 * \return Generated source text
 */
std::string Tree::NodeToIfElseByMap(int index, bool is_predict_leaf_index) const {
  std::stringstream code;
  code << std::setprecision(std::numeric_limits<double>::digits10 + 2);

  if (index < 0) {
    // leaf: emit the return statement
    const int leaf = ~index;
    code << "return ";
    if (is_predict_leaf_index) {
      code << leaf;
    } else {
      code << leaf_value_[leaf];
    }
    code << ";";
    return code.str();
  }

  // internal node: load the split feature, treating an absent key as 0
  const auto feature = split_feature_[index];
  code << "fval = arr.count(" << feature << ") > 0 ? arr.at(" << feature << ") : 0.0f;";
  const bool is_numerical = GetDecisionType(decision_type_[index], kCategoricalMask) == 0;
  if (is_numerical) {
    code << NumericalDecisionIfElse(index);
  } else {
    code << CategoricalDecisionIfElse(index);
  }
  // left subtree goes in the "if" branch, right subtree in the "else" branch
  code << NodeToIfElseByMap(left_child_[index], is_predict_leaf_index);
  code << " } else { ";
  code << NodeToIfElseByMap(right_child_[index], is_predict_leaf_index);
  code << " }";
  return code.str();
}
Tree::Tree(const std::string& str) {
std::vector<std::string> lines = Common::SplitLines(str.c_str());
std::unordered_map<std::string, std::string> key_vals;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment