Commit 603bffcf authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

[MRG] expose feature importance to c_api (#860)

* expose feature importance to c_api

* support type=gain

* remove dump model from examples and tests temporarily because it's unstable

* use double instead of float
parent e5aa5658
...@@ -52,16 +52,7 @@ y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) ...@@ -52,16 +52,7 @@ y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
print('Feature names:', gbm.feature_name()) print('Feature names:', gbm.feature_name())
print('Calculate feature importances...')
# feature importances # feature importances
print('Feature importances:', list(gbm.feature_importance())) print('Feature importances:', list(gbm.feature_importance()))
...@@ -32,7 +32,6 @@ y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) ...@@ -32,7 +32,6 @@ y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances # feature importances
print('Feature importances:', list(gbm.feature_importances_)) print('Feature importances:', list(gbm.feature_importances_))
......
...@@ -191,6 +191,14 @@ public: ...@@ -191,6 +191,14 @@ public:
*/ */
virtual bool LoadModelFromString(const std::string& model_str) = 0; virtual bool LoadModelFromString(const std::string& model_str) = 0;
/*!
* \brief Calculate per-feature importances from the trees of the model
* \param num_iteration Number of iterations whose trees are counted; <= 0 means use all
* \param importance_type 0 to count the number of splits per feature, 1 to sum split gains per feature
* \return Vector with one importance value per feature (indexed by feature index, length max_feature_idx + 1)
*/
virtual std::vector<double> FeatureImportance(int num_iteration, int importance_type) const = 0;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
* \return Max feature index of this model * \return Max feature index of this model
......
...@@ -722,6 +722,19 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle, ...@@ -722,6 +722,19 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle,
int leaf_idx, int leaf_idx,
double val); double val);
/*!
* \brief Get model feature importances
* \param handle Handle of the booster
* \param num_iteration Number of iterations whose trees are counted; <= 0 means use all
* \param importance_type 0 for split count, 1 for total split gain
* \param out_results Output value array; must be pre-allocated by the caller with one double per feature
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterFeatureImportance(BoosterHandle handle,
int num_iteration,
int importance_type,
double* out_results);
#if defined(_MSC_VER) #if defined(_MSC_VER)
// exception handle and error msg // exception handle and error msg
static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; } static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; }
......
...@@ -1680,6 +1680,14 @@ class Booster(object): ...@@ -1680,6 +1680,14 @@ class Booster(object):
predictor.pandas_categorical = self.pandas_categorical predictor.pandas_categorical = self.pandas_categorical
return predictor return predictor
def num_feature(self):
    """Return the number of features used by this Booster.

    Returns
    -------
    int
        Number of features in the underlying model.
    """
    ret = ctypes.c_int(0)
    # Query the C library; _safe_call raises on a non-zero status code.
    _safe_call(_LIB.LGBM_BoosterGetNumFeature(
        self.handle,
        ctypes.byref(ret)))
    return ret.value
def feature_name(self): def feature_name(self):
""" """
Get feature names. Get feature names.
...@@ -1689,12 +1697,7 @@ class Booster(object): ...@@ -1689,12 +1697,7 @@ class Booster(object):
result : array result : array
Array of feature names. Array of feature names.
""" """
out_num_feature = ctypes.c_int(0) num_feature = self.num_feature()
"""Get num of features"""
_safe_call(_LIB.LGBM_BoosterGetNumFeature(
self.handle,
ctypes.byref(out_num_feature)))
num_feature = out_num_feature.value
"""Get name of features""" """Get name of features"""
tmp_out_len = ctypes.c_int(0) tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)] string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
...@@ -1707,7 +1710,7 @@ class Booster(object): ...@@ -1707,7 +1710,7 @@ class Booster(object):
raise ValueError("Length of feature names doesn't equal with num_feature") raise ValueError("Length of feature names doesn't equal with num_feature")
return [string_buffers[i].value.decode() for i in range_(num_feature)] return [string_buffers[i].value.decode() for i in range_(num_feature)]
def feature_importance(self, importance_type='split'): def feature_importance(self, importance_type='split', iteration=-1):
""" """
Get feature importances Get feature importances
...@@ -1723,23 +1726,23 @@ class Booster(object): ...@@ -1723,23 +1726,23 @@ class Booster(object):
result : array result : array
Array of feature importances. Array of feature importances.
""" """
if importance_type not in ["split", "gain"]: if importance_type == "split":
raise KeyError("importance_type must be split or gain") importance_type_int = 0
dump_model = self.dump_model() elif importance_type == "gain":
ret = [0] * (dump_model["max_feature_idx"] + 1) importance_type_int = 1
else:
def dfs(root): importance_type_int = -1
if "split_feature" in root: num_feature = self.num_feature()
if root['split_gain'] > 0: result = np.array([0 for _ in range_(num_feature)], dtype=np.float64)
if importance_type == 'split': _safe_call(_LIB.LGBM_BoosterFeatureImportance(
ret[root["split_feature"]] += 1 self.handle,
elif importance_type == 'gain': ctypes.c_int(iteration),
ret[root["split_feature"]] += root["split_gain"] ctypes.c_int(importance_type_int),
dfs(root["left_child"]) result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
dfs(root["right_child"]) if importance_type_int == 0:
for tree in dump_model["tree_info"]: return result.astype(int)
dfs(tree["tree_structure"]) else:
return np.array(ret) return result
def __inner_eval(self, data_name, data_idx, feval=None): def __inner_eval(self, data_name, data_idx, feval=None):
""" """
......
...@@ -994,6 +994,8 @@ std::string GBDT::SaveModelToString(int num_iteration) const { ...@@ -994,6 +994,8 @@ std::string GBDT::SaveModelToString(int num_iteration) const {
ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl; ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl;
std::vector<double> feature_importances = FeatureImportance(num_iteration, 0);
ss << std::endl; ss << std::endl;
int num_used_model = static_cast<int>(models_.size()); int num_used_model = static_cast<int>(models_.size());
if (num_iteration > 0) { if (num_iteration > 0) {
...@@ -1006,7 +1008,20 @@ std::string GBDT::SaveModelToString(int num_iteration) const { ...@@ -1006,7 +1008,20 @@ std::string GBDT::SaveModelToString(int num_iteration) const {
ss << models_[i]->ToString() << std::endl; ss << models_[i]->ToString() << std::endl;
} }
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance(num_used_model); // store the importance first
std::vector<std::pair<size_t, std::string>> pairs;
for (size_t i = 0; i < feature_importances.size(); ++i) {
size_t feature_importances_int = static_cast<size_t>(feature_importances[i]);
if (feature_importances_int > 0) {
pairs.emplace_back(feature_importances_int, feature_names_[i]);
}
}
// sort the importance
std::sort(pairs.begin(), pairs.end(),
[] (const std::pair<size_t, std::string>& lhs,
const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first;
});
ss << std::endl << "feature importances:" << std::endl; ss << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) { for (size_t i = 0; i < pairs.size(); ++i) {
ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl; ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
...@@ -1130,30 +1145,35 @@ bool GBDT::LoadModelFromString(const std::string& model_str) { ...@@ -1130,30 +1145,35 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
return true; return true;
} }
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance(int num_used_model) const { std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type) const {
int num_used_model = static_cast<int>(models_.size());
if (num_iteration > 0) {
num_iteration += boost_from_average_ ? 1 : 0;
num_used_model = std::min(num_iteration * num_tree_per_iteration_, num_used_model);
}
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0); std::vector<double> feature_importances(max_feature_idx_ + 1, 0.0);
for (int iter = 0; iter < num_used_model; ++iter) { if (importance_type == 0) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { for (int iter = boost_from_average_ ? 1 : 0; iter < num_used_model; ++iter) {
if (models_[iter]->split_gain(split_idx) > 0) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
++feature_importances[models_[iter]->split_feature(split_idx)]; if (models_[iter]->split_gain(split_idx) > 0) {
feature_importances[models_[iter]->split_feature(split_idx)] += 1.0;
}
} }
} }
} } else if (importance_type == 1) {
// store the importance first for (int iter = boost_from_average_ ? 1 : 0; iter < num_used_model; ++iter) {
std::vector<std::pair<size_t, std::string>> pairs; for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
for (size_t i = 0; i < feature_importances.size(); ++i) { if (models_[iter]->split_gain(split_idx) > 0) {
if (feature_importances[i] > 0) { feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx);
pairs.emplace_back(feature_importances[i], feature_names_[i]); }
}
} }
} else {
Log::Fatal("Unknown importance type: only support split=0 and gain=1.");
} }
// sort the importance return feature_importances;
std::sort(pairs.begin(), pairs.end(),
[] (const std::pair<size_t, std::string>& lhs,
const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first;
});
return pairs;
} }
} // namespace LightGBM } // namespace LightGBM
...@@ -201,6 +201,14 @@ public: ...@@ -201,6 +201,14 @@ public:
*/ */
bool LoadModelFromString(const std::string& model_str) override; bool LoadModelFromString(const std::string& model_str) override;
/*!
* \brief Calculate per-feature importances from the trees of the model
* \param num_iteration Number of iterations whose trees are counted; <= 0 means use all
* \param importance_type 0 to count the number of splits per feature, 1 to sum split gains per feature
* \return Vector with one importance value per feature (indexed by feature index, length max_feature_idx + 1)
*/
std::vector<double> FeatureImportance(int num_iteration, int importance_type) const override;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
* \return Max feature index of this model * \return Max feature index of this model
...@@ -302,12 +310,6 @@ protected: ...@@ -302,12 +310,6 @@ protected:
* \return best_msg if met early_stopping * \return best_msg if met early_stopping
*/ */
std::string OutputMetric(int iter); std::string OutputMetric(int iter);
/*!
* \brief Calculate feature importances
* \param num_used_model Number of model that want to use for feature importance, -1 means use all
* \return sorted pairs of (feature_importance, feature_name)
*/
std::vector<std::pair<size_t, std::string>> FeatureImportance(int num_used_model) const;
/*! \brief current iteration */ /*! \brief current iteration */
int iter_; int iter_;
......
...@@ -248,6 +248,10 @@ public: ...@@ -248,6 +248,10 @@ public:
return boosting_->DumpModel(num_iteration); return boosting_->DumpModel(num_iteration);
} }
std::vector<double> FeatureImportance(int num_iteration, int importance_type) {
return boosting_->FeatureImportance(num_iteration, importance_type);
}
double GetLeafValue(int tree_idx, int leaf_idx) const { double GetLeafValue(int tree_idx, int leaf_idx) const {
return dynamic_cast<GBDT*>(boosting_.get())->GetLeafValue(tree_idx, leaf_idx); return dynamic_cast<GBDT*>(boosting_.get())->GetLeafValue(tree_idx, leaf_idx);
} }
...@@ -1175,6 +1179,19 @@ int LGBM_BoosterSetLeafValue(BoosterHandle handle, ...@@ -1175,6 +1179,19 @@ int LGBM_BoosterSetLeafValue(BoosterHandle handle,
API_END(); API_END();
} }
int LGBM_BoosterFeatureImportance(BoosterHandle handle,
                                  int num_iteration,
                                  int importance_type,
                                  double* out_results) {
  API_BEGIN();
  // Recover the Booster behind the opaque handle and compute one importance
  // value per feature, then copy them into the caller-provided buffer
  // (which must be large enough to hold one double per feature).
  auto* booster = reinterpret_cast<Booster*>(handle);
  const std::vector<double> importances = booster->FeatureImportance(num_iteration, importance_type);
  size_t idx = 0;
  for (const double value : importances) {
    out_results[idx] = value;
    ++idx;
  }
  API_END();
}
// ---- start of some help functions // ---- start of some help functions
std::function<std::vector<double>(int row_idx)> std::function<std::vector<double>(int row_idx)>
......
...@@ -290,7 +290,7 @@ class TestEngine(unittest.TestCase): ...@@ -290,7 +290,7 @@ class TestEngine(unittest.TestCase):
self.assertIn(valid_set_name, gbm.best_score) self.assertIn(valid_set_name, gbm.best_score)
self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
def test_continue_train_and_dump_model(self): def test_continue_train(self):
X, y = load_boston(True) X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
params = { params = {
...@@ -317,9 +317,6 @@ class TestEngine(unittest.TestCase): ...@@ -317,9 +317,6 @@ class TestEngine(unittest.TestCase):
self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5) self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']): for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
self.assertAlmostEqual(l1, mae, places=5) self.assertAlmostEqual(l1, mae, places=5)
# test dump model
self.assertIn('tree_info', gbm.dump_model())
self.assertIsInstance(gbm.feature_importance(), np.ndarray)
os.remove(model_name) os.remove(model_name)
def test_continue_train_multiclass(self): def test_continue_train_multiclass(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment