Commit 301402c8 authored by Patrick Ford's avatar Patrick Ford Committed by Nikita Titov
Browse files

[python] Output model to a pandas DataFrame (#2592)

* trees_to_df method and unit test added. PEP 8 fixes for integration.

* Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

Post-review changes

* changes from second round of reviews from striker

* third round of review. formatting and added 2 more tests

* replaced pandas dot attribute accessor with string attribute accessor

* dealt with single tree edge case and minor refactor of tests

* slight refactor for checking if tree is a single node
parent f6b8ecf6
...@@ -7,6 +7,7 @@ import ctypes ...@@ -7,6 +7,7 @@ import ctypes
import os import os
import warnings import warnings
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from collections import OrderedDict
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
...@@ -1858,6 +1859,121 @@ class Booster(object): ...@@ -1858,6 +1859,121 @@ class Booster(object):
self.network = False self.network = False
return self return self
def trees_to_dataframe(self):
    """Parse the fitted model and return in an easy-to-read pandas DataFrame.

    Returns
    -------
    result : pandas DataFrame
        Returns a pandas DataFrame of the parsed model.
    """
    if not PANDAS_INSTALLED:
        raise LightGBMError('This method cannot be run without pandas installed')

    if self.num_trees() == 0:
        raise LightGBMError('There are no trees in this Booster and thus nothing to parse')

    def _is_split_node(tree):
        # Split nodes carry a 'split_index' key; leaf nodes do not.
        return 'split_index' in tree

    def _is_single_node_tree(tree):
        # A tree that never split holds nothing but its constant prediction.
        return set(tree.keys()) == {'leaf_value'}

    def _node_label(tree, tree_index):
        # Build labels like '3-S12' (split 12 of tree 3) or 'L7'
        # ('L'/'S' marks leaf vs. split; prefix dropped when tree_index is None).
        prefix = '' if tree_index is None else str(tree_index) + '-'
        if _is_split_node(tree):
            return prefix + 'S' + str(tree['split_index'])
        # A single-node tree has no 'leaf_index', so fall back to 0.
        return prefix + 'L' + str(tree.get('leaf_index', 0))

    def _node_record(tree, node_depth, tree_index, feature_names, parent_node):
        # One row of the output frame; pairs are listed in final column order.
        if _is_split_node(tree):
            if feature_names is None:
                feature = tree['split_feature']
            else:
                feature = feature_names[tree['split_feature']]
            left = _node_label(tree['left_child'], tree_index)
            right = _node_label(tree['right_child'], tree_index)
            tail = [('split_feature', feature),
                    ('split_gain', tree['split_gain']),
                    ('threshold', tree['threshold']),
                    ('decision_type', tree['decision_type']),
                    ('missing_direction', 'left' if tree['default_left'] else 'right'),
                    ('missing_type', tree['missing_type']),
                    ('value', tree['internal_value']),
                    ('weight', tree['internal_weight']),
                    ('count', tree['internal_count'])]
        else:
            single = _is_single_node_tree(tree)
            left = right = None
            tail = [('split_feature', None),
                    ('split_gain', None),
                    ('threshold', None),
                    ('decision_type', None),
                    ('missing_direction', None),
                    ('missing_type', None),
                    ('value', tree['leaf_value']),
                    # single-node trees expose neither weight nor count
                    ('weight', None if single else tree['leaf_weight']),
                    ('count', None if single else tree['leaf_count'])]
        head = [('tree_index', tree_index),
                ('node_depth', node_depth),
                ('node_index', _node_label(tree, tree_index)),
                ('left_child', left),
                ('right_child', right),
                ('parent_index', parent_node)]
        return OrderedDict(head + tail)

    def _flatten(tree, node_depth=1, tree_index=None,
                 feature_names=None, parent_node=None):
        # Depth-first traversal yielding one record per node, parent first.
        record = _node_record(tree, node_depth, tree_index,
                              feature_names, parent_node)
        records = [record]
        if _is_split_node(tree):
            for side in ('left_child', 'right_child'):
                records.extend(_flatten(tree[side],
                                        node_depth=node_depth + 1,
                                        tree_index=tree_index,
                                        feature_names=feature_names,
                                        parent_node=record['node_index']))
        return records

    model_dict = self.dump_model()
    rows = []
    for tree in model_dict['tree_info']:
        rows.extend(_flatten(tree['tree_structure'],
                             tree_index=tree['tree_index'],
                             feature_names=model_dict['feature_names']))
    return DataFrame(rows, columns=rows[0].keys())
def set_train_data_name(self, name): def set_train_data_name(self, name):
"""Set the name to the training Dataset. """Set the name to the training Dataset.
......
...@@ -328,3 +328,37 @@ class TestBasic(unittest.TestCase): ...@@ -328,3 +328,37 @@ class TestBasic(unittest.TestCase):
lgb_data.set_weight(sequence) lgb_data.set_weight(sequence)
lgb_data.set_init_score(sequence) lgb_data.set_init_score(sequence)
check_asserts(lgb_data) check_asserts(lgb_data)
def test_trees_to_dataframe(self):
    """Check trees_to_dataframe() agrees with feature_importance() and model shape."""

    def _importances_as_array(X, importance_by_col):
        # Map a {column_name: value} dict onto the feature order of X,
        # filling features that never split with 0.
        names = ['Column_' + str(i) for i in range(X.shape[1])]
        return np.array([importance_by_col.get(name, 0.) for name in names])

    X, y = load_breast_cancer(True)
    num_trees = 10
    bst = lgb.train({"objective": "binary"},
                    lgb.Dataset(X, label=y),
                    num_trees)
    tree_df = bst.trees_to_dataframe()

    # Recompute per-feature split counts and total gains from the frame.
    split_dict = (tree_df[~tree_df['split_gain'].isnull()]
                  .groupby('split_feature')
                  .size()
                  .to_dict())
    gains_dict = (tree_df
                  .groupby('split_feature')['split_gain']
                  .sum()
                  .to_dict())

    np.testing.assert_equal(_importances_as_array(X, split_dict),
                            bst.feature_importance('split'))
    np.testing.assert_allclose(_importances_as_array(X, gains_dict),
                               bst.feature_importance('gain'))
    # One tree index per boosting round.
    self.assertEqual(tree_df['tree_index'].nunique(), num_trees)
    # Every root node (depth 1) should account for the whole training set.
    np.testing.assert_equal(tree_df.loc[tree_df['node_depth'] == 1, 'count'].values,
                            len(y))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.