Commit ef778069 authored by Guolin Ke
Browse files

Add categorical feature support back.

parent d93eb338
...@@ -131,7 +131,7 @@ public: ...@@ -131,7 +131,7 @@ public:
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1; uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin(); uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin, return bin_data_->Split(min_bin, max_bin, default_bin,
threshold, data_indices, num_data, lte_indices, gt_indices); threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
} }
/*! /*!
* \brief From bin to feature value * \brief From bin to feature value
......
...@@ -34,6 +34,7 @@ public: ...@@ -34,6 +34,7 @@ public:
* \brief Performing a split on tree leaves. * \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split * \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features * \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split * \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data * \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value * \param threshold_double Threshold on feature value
...@@ -44,7 +45,7 @@ public: ...@@ -44,7 +45,7 @@ public:
* \param gain Split gain * \param gain Split gain
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, uint32_t threshold, int real_feature, int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain); double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
...@@ -113,6 +114,15 @@ public: ...@@ -113,6 +114,15 @@ public:
/*! \brief Serialize this object to json*/ /*! \brief Serialize this object to json*/
std::string ToJSON(); std::string ToJSON();
/*!
* \brief Categorical split test: both operands are converted to int and compared for equality
* \param fval Feature value (or bin) of the record
* \param threshold Category stored on the split node
* \return true when the two converted values are equal
*/
template<typename T>
static bool CategoricalDecision(T fval, T threshold) {
  const int category = static_cast<int>(fval);
  const int target = static_cast<int>(threshold);
  return category == target;
}
template<typename T> template<typename T>
static bool NumericalDecision(T fval, T threshold) { static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) { if (fval <= threshold) {
...@@ -122,13 +132,18 @@ public: ...@@ -122,13 +132,18 @@ public:
} }
} }
/*! \brief Name of a decision type code: "no_greater" for 0, "is" for any other value */
private: static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
} else {
return "is";
}
}
inline int GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators, static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
data_size_t data_idx) const; static std::vector<bool(*)(double, double)> decision_funs;
inline int GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators, private:
data_size_t data_idx) const;
/*! /*!
* \brief Find leaf index of which record belongs by features * \brief Find leaf index of which record belongs by features
...@@ -157,6 +172,8 @@ private: ...@@ -157,6 +172,8 @@ private:
std::vector<uint32_t> threshold_in_bin_; std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */ /*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_; std::vector<double> split_gain_;
// used for leaf node // used for leaf node
...@@ -173,6 +190,7 @@ private: ...@@ -173,6 +190,7 @@ private:
/*! \brief Depth for leaves */ /*! \brief Depth for leaves */
std::vector<int> leaf_depth_; std::vector<int> leaf_depth_;
double shrinkage_; double shrinkage_;
bool has_categorical_;
}; };
...@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const { ...@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return leaf; return leaf;
} }
/*!
* \brief Walk the tree for one record using per-node bin iterators
* \param iterators Bin iterators, indexed by node
* \param data_idx Index of the record
* \return Bitwise complement of the first negative node value reached
*/
inline int Tree::GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
                         data_size_t data_idx) const {
  int cur = 0;
  // Descend while the current node value is non-negative.
  while (cur >= 0) {
    const bool go_left = NumericalDecision<uint32_t>(
      iterators[cur]->Get(data_idx),
      threshold_in_bin_[cur]);
    cur = go_left ? left_child_[cur] : right_child_[cur];
  }
  return ~cur;
}
/*!
* \brief Walk the tree for one record using bin iterators indexed by split_feature_inner
* \param iterators Bin iterators, indexed by the value of split_feature_inner for each node
* \param data_idx Index of the record
* \return Bitwise complement of the first negative node value reached
*/
inline int Tree::GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
                            data_size_t data_idx) const {
  int cur = 0;
  // Descend while the current node value is non-negative.
  while (cur >= 0) {
    const bool go_left = NumericalDecision<uint32_t>(
      iterators[split_feature_inner[cur]]->Get(data_idx),
      threshold_in_bin_[cur]);
    cur = go_left ? left_child_[cur] : right_child_[cur];
  }
  return ~cur;
}
inline int Tree::GetLeaf(const double* feature_values) const { inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (NumericalDecision<double>( if (decision_funs[decision_type_[node]](
feature_values[split_feature_[node]], feature_values[split_feature_[node]],
threshold_[node])) { threshold_[node])) {
node = left_child_[node]; node = left_child_[node];
......
...@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child): ...@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf): def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf):
if is_left_child: if is_left_child:
op = 'lessOrEqual' op = 'equal' if decision_type[prev_node_idx] == 1 else 'lessOrEqual'
else: else:
op = 'greaterThan' op = 'notEqual' if decision_type[prev_node_idx] == 1 else 'greaterThan'
out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format( out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format(
get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf))) get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf)))
...@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out: ...@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature = get_array_ints(next(model_content)) split_feature = get_array_ints(next(model_content))
split_gain = next(model_content) # unused split_gain = next(model_content) # unused
threshold = get_array_strings(next(model_content)) threshold = get_array_strings(next(model_content))
decision_type = get_array_ints(next(model_content))
left_child = get_array_ints(next(model_content)) left_child = get_array_ints(next(model_content))
right_child = get_array_ints(next(model_content)) right_child = get_array_ints(next(model_content))
leaf_parent = get_array_ints(next(model_content)) leaf_parent = get_array_ints(next(model_content))
......
...@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile ...@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json, numeric_types, from .compat import (DataFrame, Series, integer_types, json,
range_, string_type) json_default_with_numpy, numeric_types, range_,
string_type)
from .libpath import find_lib_path from .libpath import find_lib_path
...@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', ...@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'} 'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name): def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
bad_fields = [data.columns[i] for i, dtype in enumerate(data.dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER] if feature_name == 'auto' or feature_name is None:
if bad_fields:
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields: """
raise ValueError(msg + ', '.join(bad_fields))
if feature_name == 'auto':
if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]): if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]):
msg = """Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values.""" msg = """Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
warnings.filterwarnings('once') warnings.filterwarnings('once')
warnings.warn(msg, stacklevel=5) warnings.warn(msg, stacklevel=5)
feature_name = [str(name) for name in data.columns] data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float') data = data.values.astype('float')
elif feature_name == 'auto': else:
if feature_name == 'auto':
feature_name = None feature_name = None
return data, feature_name if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label): def _label_from_pandas(label):
...@@ -249,6 +277,19 @@ def _label_from_pandas(label): ...@@ -249,6 +277,19 @@ def _label_from_pandas(label):
return label return label
def _save_pandas_categorical(file_name, pandas_categorical):
    """Append the pandas categorical mapping to a model file.

    A new line of the form ``pandas_categorical:<json>`` is appended to
    ``file_name``; numpy values are converted through ``json_default_with_numpy``.

    Parameters
    ----------
    file_name : str
        Path of the file to append to.
    pandas_categorical : object
        Value to serialize as JSON.
    """
    with open(file_name, 'a') as model_file:
        serialized = json.dumps(pandas_categorical, default=json_default_with_numpy)
        model_file.write('\npandas_categorical:' + serialized)
def _load_pandas_categorical(file_name):
with open(file_name, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
return json.loads(last_line[len('pandas_categorical:'):])
return None
class _InnerPredictor(object): class _InnerPredictor(object):
""" """
A _InnerPredictor of LightGBM. A _InnerPredictor of LightGBM.
...@@ -280,6 +321,7 @@ class _InnerPredictor(object): ...@@ -280,6 +321,7 @@ class _InnerPredictor(object):
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.num_class = out_num_class.value self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif booster_handle is not None: elif booster_handle is not None:
self.__is_manage_handle = False self.__is_manage_handle = False
self.handle = booster_handle self.handle = booster_handle
...@@ -293,6 +335,7 @@ class _InnerPredictor(object): ...@@ -293,6 +335,7 @@ class _InnerPredictor(object):
self.handle, self.handle,
ctypes.byref(out_num_iterations))) ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else: else:
raise TypeError('Need Model file or Booster handle to create a predictor') raise TypeError('Need Model file or Booster handle to create a predictor')
...@@ -328,7 +371,7 @@ class _InnerPredictor(object): ...@@ -328,7 +371,7 @@ class _InnerPredictor(object):
""" """
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None)[0] data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
...@@ -359,6 +402,9 @@ class _InnerPredictor(object): ...@@ -359,6 +402,9 @@ class _InnerPredictor(object):
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type) predict_type)
elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else: else:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
...@@ -486,7 +532,7 @@ class Dataset(object): ...@@ -486,7 +532,7 @@ class Dataset(object):
"""Dataset in LightGBM.""" """Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None, def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False, weight=None, group=None, silent=False,
feature_name='auto', params=None, feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True): free_raw_data=True):
""" """
Parameters Parameters
...@@ -509,6 +555,11 @@ class Dataset(object): ...@@ -509,6 +555,11 @@ class Dataset(object):
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional params: dict, optional
Other parameters Other parameters
free_raw_data: Bool free_raw_data: Bool
...@@ -523,10 +574,12 @@ class Dataset(object): ...@@ -523,10 +574,12 @@ class Dataset(object):
self.group = group self.group = group
self.silent = silent self.silent = silent
self.feature_name = feature_name self.feature_name = feature_name
self.categorical_feature = categorical_feature
self.params = params self.params = params
self.free_raw_data = free_raw_data self.free_raw_data = free_raw_data
self.used_indices = None self.used_indices = None
self._predictor = None self._predictor = None
self.pandas_categorical = None
def __del__(self): def __del__(self):
self._free_handle() self._free_handle()
...@@ -539,11 +592,11 @@ class Dataset(object): ...@@ -539,11 +592,11 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None, def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None, weight=None, group=None, predictor=None,
silent=False, feature_name='auto', silent=False, feature_name='auto',
params=None): categorical_feature='auto', params=None):
if data is None: if data is None:
self.handle = None self.handle = None
return return
data, feature_name, = _data_from_pandas(data, feature_name) data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label) label = _label_from_pandas(label)
self.data_has_header = False self.data_has_header = False
"""process for args""" """process for args"""
...@@ -555,6 +608,23 @@ class Dataset(object): ...@@ -555,6 +608,23 @@ class Dataset(object):
params["verbose"] = 0 params["verbose"] = 0
elif "verbose" not in params: elif "verbose" not in params:
params["verbose"] = 1 params["verbose"] = 1
"""get categorical features"""
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""process for reference dataset""" """process for reference dataset"""
ref_dataset = None ref_dataset = None
...@@ -714,7 +784,7 @@ class Dataset(object): ...@@ -714,7 +784,7 @@ class Dataset(object):
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin, self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, predictor=self._predictor, weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name, silent=self.silent, feature_name=self.feature_name,
params=self.params) categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data: if self.free_raw_data:
self.data = None self.data = None
return self return self
...@@ -744,6 +814,7 @@ class Dataset(object): ...@@ -744,6 +814,7 @@ class Dataset(object):
weight=weight, group=group, silent=silent, params=params, weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data) free_raw_data=self.free_raw_data)
ret._predictor = self._predictor ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret return ret
def subset(self, used_indices, params=None): def subset(self, used_indices, params=None):
...@@ -758,8 +829,9 @@ class Dataset(object): ...@@ -758,8 +829,9 @@ class Dataset(object):
Other parameters Other parameters
""" """
ret = Dataset(None, reference=self, feature_name=self.feature_name, ret = Dataset(None, reference=self, feature_name=self.feature_name,
params=params) categorical_feature=self.categorical_feature, params=params)
ret._predictor = self._predictor ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices ret.used_indices = used_indices
return ret return ret
...@@ -867,6 +939,24 @@ class Dataset(object): ...@@ -867,6 +939,24 @@ class Dataset(object):
else: else:
raise TypeError("Unknown type") raise TypeError("Unknown type")
def set_categorical_feature(self, categorical_feature):
    """
    Set categorical features.

    Does nothing when the new value equals the current one. Otherwise the
    new value is stored and the handle is freed; this requires the raw data
    to still be present.

    Parameters
    ----------
    categorical_feature : list of int or str
        Name/index of categorical features

    Raises
    ------
    LightGBMError
        If the value differs from the current one and self.data is None.
    """
    unchanged = self.categorical_feature == categorical_feature
    if unchanged:
        return
    if self.data is None:
        raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
    self.categorical_feature = categorical_feature
    self._free_handle()
def _set_predictor(self, predictor): def _set_predictor(self, predictor):
""" """
Set predictor for continued training, not recommand for user to call this function. Set predictor for continued training, not recommand for user to call this function.
...@@ -889,6 +979,7 @@ class Dataset(object): ...@@ -889,6 +979,7 @@ class Dataset(object):
reference : Dataset reference : Dataset
Will use reference as template to consturct current dataset Will use reference as template to consturct current dataset
""" """
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name) self.set_feature_name(reference.feature_name)
self._set_predictor(reference._predictor) self._set_predictor(reference._predictor)
if self.reference is reference: if self.reference is reference:
...@@ -1117,6 +1208,7 @@ class Booster(object): ...@@ -1117,6 +1208,7 @@ class Booster(object):
self.__inner_predict_buffer = [None] self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False] self.__is_predicted_cur_iter = [False]
self.__get_eval_info() self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None: elif model_file is not None:
"""Prediction task""" """Prediction task"""
out_num_iterations = ctypes.c_int(0) out_num_iterations = ctypes.c_int(0)
...@@ -1129,6 +1221,7 @@ class Booster(object): ...@@ -1129,6 +1221,7 @@ class Booster(object):
self.handle, self.handle,
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value self.__num_class = out_num_class.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif 'model_str' in params: elif 'model_str' in params:
self.__load_model_from_string(params['model_str']) self.__load_model_from_string(params['model_str'])
else: else:
...@@ -1144,6 +1237,7 @@ class Booster(object): ...@@ -1144,6 +1237,7 @@ class Booster(object):
def __deepcopy__(self, _): def __deepcopy__(self, _):
model_str = self.__save_model_to_string() model_str = self.__save_model_to_string()
booster = Booster({'model_str': model_str}) booster = Booster({'model_str': model_str})
booster.pandas_categorical = self.pandas_categorical
return booster return booster
def __getstate__(self): def __getstate__(self):
...@@ -1383,6 +1477,7 @@ class Booster(object): ...@@ -1383,6 +1477,7 @@ class Booster(object):
self.handle, self.handle,
ctypes.c_int(num_iteration), ctypes.c_int(num_iteration),
c_str(filename))) c_str(filename)))
_save_pandas_categorical(filename, self.pandas_categorical)
def __load_model_from_string(self, model_str): def __load_model_from_string(self, model_str):
"""[Private] Load model from string""" """[Private] Load model from string"""
...@@ -1494,6 +1589,7 @@ class Booster(object): ...@@ -1494,6 +1589,7 @@ class Booster(object):
def _to_predictor(self): def _to_predictor(self):
"""Convert to predictor""" """Convert to predictor"""
predictor = _InnerPredictor(booster_handle=self.handle) predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor return predictor
def feature_name(self): def feature_name(self):
......
...@@ -39,6 +39,15 @@ except (ImportError, SyntaxError): ...@@ -39,6 +39,15 @@ except (ImportError, SyntaxError):
import json import json
def json_default_with_numpy(obj):
    """Convert numpy values to plain Python objects for use as a ``json.dumps`` default hook.

    numpy integer, floating and bool scalars are converted with ``item()``,
    numpy arrays with ``tolist()``; any other object is returned unchanged.
    """
    numpy_scalar_types = (np.integer, np.floating, np.bool_)
    if isinstance(obj, numpy_scalar_types):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
"""pandas""" """pandas"""
try: try:
from pandas import Series, DataFrame from pandas import Series, DataFrame
......
...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types, ...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100, def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None, valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
feature_name='auto', feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
""" """
...@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100, ...@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100,
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name) train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False is_valid_contain_train = False
train_data_name = "training" train_data_name = "training"
...@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results): ...@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10, def cv(params, train_set, num_boost_round=10,
data_splitter=None, nfold=5, stratified=False, shuffle=True, data_splitter=None, nfold=5, stratified=False, shuffle=True,
metrics=None, fobj=None, feval=None, init_model=None, metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto', feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None): callbacks=None):
...@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10, ...@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
...@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10, ...@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10,
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name) train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics: if metrics:
params.setdefault('metric', []) params.setdefault('metric', [])
......
...@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names): ...@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if info in {'split_gain', 'internal_value', 'internal_count'}: if info in {'split_gain', 'internal_value', 'internal_count'}:
label += '\n' + info + ':' + str(root[info]) label += '\n' + info + ':' + str(root[info])
graph.node(name, label=label) graph.node(name, label=label)
if root['decision_type'] == 'no_greater':
l_dec, r_dec = '<=', '>' l_dec, r_dec = '<=', '>'
elif root['decision_type'] == 'is':
l_dec, r_dec = 'is', "isn't"
else:
raise ValueError('Invalid decision type in tree model.')
add(root['left_child'], name, l_dec) add(root['left_child'], name, l_dec)
add(root['right_child'], name, r_dec) add(root['right_child'], name, r_dec)
else: # leaf else: # leaf
......
...@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase): ...@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
""" """
Fit the gradient boosting model Fit the gradient boosting model
...@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase): ...@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions callbacks : list of callback functions
List of callback functions that are applied at each iteration. List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information. See Callbacks in Python-API.md for more information.
...@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase): ...@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval, evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, feature_name=feature_name, verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
if evals_result: if evals_result:
...@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None, eval_init_score=None,
eval_metric="l2", eval_metric="l2",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', callbacks=None): feature_name='auto', categorical_feature='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight, super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set, init_score=init_score, eval_set=eval_set,
...@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
...@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None, eval_init_score=None,
eval_metric="binary_logloss", eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y) y = self._le.transform(y)
...@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
...@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel): ...@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1, eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
""" """
Most arguments like common methods except following: Most arguments like common methods except following:
...@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel): ...@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
...@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) { ...@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_; num_bin_ = other.num_bin_;
is_trival_ = other.is_trival_; is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_; sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_; bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_; min_val_ = other.min_val_;
max_val_ = other.max_val_; max_val_ = other.max_val_;
default_bin_ = other.default_bin_; default_bin_ = other.default_bin_;
...@@ -38,7 +44,8 @@ BinMapper::~BinMapper() { ...@@ -38,7 +44,8 @@ BinMapper::~BinMapper() {
} }
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) { bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0; int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) { for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i]; sum_left += cnt_in_bin[i];
...@@ -48,12 +55,23 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) { ...@@ -48,12 +55,23 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
return false; return false;
} }
} }
} else {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt) {
return false;
} else if (total_cnt - sum_left >= filter_cnt) {
return false;
}
}
}
return true; return true;
} }
void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt, void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data) { int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
// limit max_bin by min_data_in_bin bin_type_ = bin_type;
default_bin_ = 0;
std::vector<double>& raw_values = values; std::vector<double>& raw_values = values;
int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size()); int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
// find distinct_values first // find distinct_values first
...@@ -95,7 +113,7 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt, ...@@ -95,7 +113,7 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
max_val_ = distinct_values.back(); max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin; std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size()); int num_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (num_values <= max_bin) { if (num_values <= max_bin) {
// use distinct value is enough // use distinct value is enough
bin_upper_bound_.clear(); bin_upper_bound_.clear();
...@@ -172,21 +190,57 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt, ...@@ -172,21 +190,57 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
// last bin upper bound // last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity(); bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
CHECK(num_bin_ <= max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
counts_int.push_back(counts[0]);
for (size_t i = 1; i < distinct_values.size(); ++i) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// will ignore the categories with small counts
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
}
cnt_in_bin = counts_int;
counts_int.resize(num_bin_);
counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// check trival(num_bin_ == 1) feature // check trival(num_bin_ == 1) feature
if (num_bin_ <= 1) { if (num_bin_ <= 1) {
is_trival_ = true; is_trival_ = true;
default_bin_ = 0;
} else { } else {
is_trival_ = false; is_trival_ = false;
default_bin_ = ValueToBin(0);
} }
if (NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data)) { // check useless bin
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trival_ = true; is_trival_ = true;
} }
if (!is_trival_) {
default_bin_ = ValueToBin(0);
}
// calculate sparse rate // calculate sparse rate
CHECK(num_bin_ <= max_bin); sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(total_sample_cnt);
} }
...@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) { ...@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += sizeof(int); size += sizeof(int);
size += sizeof(bool); size += sizeof(bool);
size += sizeof(double); size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double); size += 2 * sizeof(double);
size += bin * sizeof(double); size += bin * sizeof(double);
size += sizeof(uint32_t); size += sizeof(uint32_t);
...@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) { ...@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(is_trival_); buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_)); std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
// Serialize the value range and the default bin INTO the buffer.
// The destination has to be `buffer`, mirroring the reads done in CopyFrom;
// with the arguments swapped the members were overwritten by whatever the
// output buffer contained and these fields were never written out.
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double)); std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
} }
void BinMapper::CopyFrom(const char * buffer) { void BinMapper::CopyFrom(const char * buffer) {
...@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) { ...@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(is_trival_); buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_)); std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_)); std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_); buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_)); std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_); buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_)); std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_); buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_); bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double)); std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
}
} }
void BinMapper::SaveBinaryToFile(FILE* file) const { void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file); fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file); fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file); fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file); fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file); fwrite(&max_val_, sizeof(max_val_), 1, file);
fwrite(&default_bin_, sizeof(default_bin_), 1, file); fwrite(&default_bin_, sizeof(default_bin_), 1, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file); fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
}
} }
size_t BinMapper::SizesInByte() const { size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_) size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_); + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_; ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
}
return ret; return ret;
} }
......
...@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "weight_column", &weight_column); GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column); GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column); GetString(params, "ignore_column", &ignore_column);
GetString(params, "categorical_column", &categorical_column);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf); GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
// The key used to be misspelled ("min_dato_in_bin"), so a user-supplied
// "min_data_in_bin" was never picked up. The legacy spelling is still read
// for backward compatibility; the correct key is read last so it wins when
// both are given.
// NOTE(review): assumes GetInt leaves the target untouched when the key is
// absent, as the surrounding calls appear to rely on — confirm.
GetInt(params, "min_dato_in_bin", &min_data_in_bin);
GetInt(params, "min_data_in_bin", &min_data_in_bin);
GetDouble(params, "max_conflict_rate", &max_conflict_rate); GetDouble(params, "max_conflict_rate", &max_conflict_rate);
......
...@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup( ...@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup(
void Dataset::Construct( void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers, std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_indices, const std::vector<std::vector<int>>&,
size_t total_sample_cnt, size_t,
const IOConfig& io_config) { const IOConfig& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size()); num_total_features_ = static_cast<int>(bin_mappers.size());
// get num_features // get num_features
......
...@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) { ...@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_.emplace(group_idx_); ignore_features_.emplace(group_idx_);
} }
} }
if (io_config_.categorical_column.size() > 0) {
if (Common::StartsWith(io_config_.categorical_column, name_prefix)) {
std::string names = io_config_.categorical_column.substr(name_prefix.size());
for (auto name : Common::Split(names.c_str(), ',')) {
if (name2idx.count(name) > 0) {
int tmp = name2idx[name];
categorical_features_.emplace(tmp);
} else {
Log::Fatal("Could not find categorical_column %s in data file", name.c_str());
}
}
} else {
for (auto token : Common::Split(io_config_.categorical_column.c_str(), ',')) {
int tmp = 0;
if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
Log::Fatal("categorical_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
}
categorical_features_.emplace(tmp);
}
}
}
} }
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) { Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
...@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& ...@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
bin_mappers[i] = nullptr; bin_mappers[i] = nullptr;
continue; continue;
} }
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper()); bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], total_sample_size, bin_mappers[i]->FindBin(sample_values[i], total_sample_size,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt); io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
} }
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data)); auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->feature_names_ = feature_names_; dataset->feature_names_ = feature_names_;
...@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
bin_mappers[i] = nullptr; bin_mappers[i] = nullptr;
continue; continue;
} }
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper()); bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], sample_data.size(), bin_mappers[i]->FindBin(sample_values[i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt); io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
} }
} else { } else {
// if have multi-machines, need find bin distributed // if have multi-machines, need find bin distributed
...@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer // find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) { for (int i = 0; i < len[rank]; ++i) {
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
BinMapper bin_mapper; BinMapper bin_mapper;
bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(), bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt); io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
bin_mapper.CopyTo(input_buffer.data() + i * type_size); bin_mapper.CopyTo(input_buffer.data() + i * type_size);
} }
// convert to binary size // convert to binary size
......
...@@ -132,7 +132,7 @@ public: ...@@ -132,7 +132,7 @@ public:
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin); VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin); VAL_T minb = static_cast<VAL_T>(min_bin);
...@@ -144,6 +144,7 @@ public: ...@@ -144,6 +144,7 @@ public:
data_size_t gt_count = 0; data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
...@@ -159,6 +160,23 @@ public: ...@@ -159,6 +160,23 @@ public:
lte_indices[lte_count++] = idx; lte_indices[lte_count++] = idx;
} }
} }
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count; return lte_count;
} }
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
......
...@@ -161,7 +161,7 @@ public: ...@@ -161,7 +161,7 @@ public:
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin); uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin); uint8_t minb = static_cast<uint8_t>(min_bin);
...@@ -173,6 +173,7 @@ public: ...@@ -173,6 +173,7 @@ public:
data_size_t gt_count = 0; data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
...@@ -188,6 +189,23 @@ public: ...@@ -188,6 +189,23 @@ public:
lte_indices[lte_count++] = idx; lte_indices[lte_count++] = idx;
} }
} }
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count; return lte_count;
} }
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
......
...@@ -125,7 +125,7 @@ public: ...@@ -125,7 +125,7 @@ public:
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
// not need to split // not need to split
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin); VAL_T th = static_cast<VAL_T>(threshold + min_bin);
...@@ -139,6 +139,7 @@ public: ...@@ -139,6 +139,7 @@ public:
data_size_t gt_count = 0; data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
...@@ -154,6 +155,23 @@ public: ...@@ -154,6 +155,23 @@ public:
lte_indices[lte_count++] = idx; lte_indices[lte_count++] = idx;
} }
} }
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count; return lte_count;
} }
......
...@@ -15,6 +15,11 @@ ...@@ -15,6 +15,11 @@
namespace LightGBM { namespace LightGBM {
// Dispatch tables for split decisions, indexed by a node's decision_type_
// (Tree::Split stores 0 for numerical bins and 1 for categorical bins).
// inner_decision_funs compares a bin value against threshold_in_bin_.
std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
{ Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
// Same table for double operands; presumably used when deciding on raw
// feature values — the call sites are outside this view, confirm.
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves) Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) { :max_leaves_(max_leaves) {
...@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves) ...@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves)
split_feature_ = std::vector<int>(max_leaves_ - 1); split_feature_ = std::vector<int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1); threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1); threshold_ = std::vector<double>(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1); split_gain_ = std::vector<double>(max_leaves_ - 1);
leaf_parent_ = std::vector<int>(max_leaves_); leaf_parent_ = std::vector<int>(max_leaves_);
leaf_value_ = std::vector<double>(max_leaves_); leaf_value_ = std::vector<double>(max_leaves_);
...@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves) ...@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves)
num_leaves_ = 1; num_leaves_ = 1;
leaf_parent_[0] = -1; leaf_parent_[0] = -1;
shrinkage_ = 1.0f; shrinkage_ = 1.0f;
has_categorical_ = false;
} }
Tree::~Tree() { Tree::~Tree() {
} }
int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature, int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) { double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1; int new_node_idx = num_leaves_ - 1;
...@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature, ...@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
// add new node // add new node
split_feature_inner[new_node_idx] = feature; split_feature_inner[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature; split_feature_[new_node_idx] = real_feature;
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
} else {
has_categorical_ = true;
decision_type_[new_node_idx] = 1;
}
threshold_in_bin_[new_node_idx] = threshold_bin; threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double; threshold_[new_node_idx] = threshold_double;
split_gain_[new_node_idx] = gain; split_gain_[new_node_idx] = gain;
...@@ -84,6 +97,54 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature, ...@@ -84,6 +97,54 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
} }
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
} else {
if (data->num_features() > num_leaves_ - 1) { if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data, Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) { [this, &data, score](int, data_size_t start, data_size_t end) {
...@@ -94,7 +155,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -94,7 +155,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
iter[i]->Reset(start); iter[i]->Reset(start);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeaf(iter, i)]); int node = 0;
while (node >= 0) {
if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
} }
}); });
} else { } else {
...@@ -106,15 +175,25 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -106,15 +175,25 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
iter[i]->Reset(start); iter[i]->Reset(start);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeafRaw(iter, i)]); int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
} }
}); });
} }
}
} }
void Tree::AddPredictionToScore(const Dataset* data, void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices, const data_size_t* used_data_indices,
data_size_t num_data, double* score) const { data_size_t num_data, double* score) const {
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) { if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data, Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) { [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
...@@ -125,7 +204,18 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -125,7 +204,18 @@ void Tree::AddPredictionToScore(const Dataset* data,
iter[i]->Reset(used_data_indices[start]); iter[i]->Reset(used_data_indices[start]);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iter, used_data_indices[i])]); int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
} }
}); });
} else { } else {
...@@ -137,10 +227,67 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -137,10 +227,67 @@ void Tree::AddPredictionToScore(const Dataset* data,
iter[i]->Reset(used_data_indices[start]); iter[i]->Reset(used_data_indices[start]);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeafRaw(iter, used_data_indices[i])]); const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
} }
}); });
} }
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
}
} }
std::string Tree::ToString() { std::string Tree::ToString() {
...@@ -152,6 +299,8 @@ std::string Tree::ToString() { ...@@ -152,6 +299,8 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "threshold=" str_buf << "threshold="
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "decision_type="
<< Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
str_buf << "left_child=" str_buf << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "right_child=" str_buf << "right_child="
...@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) { ...@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) {
str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl; str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl; str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl; str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl; str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl; str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl; str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
...@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) { ...@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) {
|| key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0 || key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
|| key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0 || key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("shrinkage") <= 0 || key_vals.count("leaf_count") <= 0 || key_vals.count("shrinkage") <= 0
|| key_vals.count("decision_type") <= 0
) { ) {
Log::Fatal("Tree model string format error"); Log::Fatal("Tree model string format error");
} }
...@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) { ...@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) {
right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1); right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1); split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1); threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1); split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1); internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1); internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
......
...@@ -41,9 +41,16 @@ public: ...@@ -41,9 +41,16 @@ public:
* \param feature the feature data for this histogram * \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf * \param min_num_data_one_leaf minimal number of data in one leaf
*/ */
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) { void Init(HistogramBinEntry* data, const FeatureMetainfo* meta, BinType bin_type) {
meta_ = meta; meta_ = meta;
data_ = data; data_ = data;
if (bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
} }
HistogramBinEntry* RawData() { HistogramBinEntry* RawData() {
...@@ -60,9 +67,14 @@ public: ...@@ -60,9 +67,14 @@ public:
data_[i].sum_hessians -= other.data_[i].sum_hessians; data_[i].sum_hessians -= other.data_[i].sum_hessians;
} }
} }
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) { SplitInfo* output) {
sum_hessian += 2 * kEpsilon; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_sum_left_gradient = NAN; double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN; double best_sum_left_hessian = NAN;
double best_gain = kMinScore; double best_gain = kMinScore;
...@@ -131,6 +143,97 @@ public: ...@@ -131,6 +143,97 @@ public:
output->gain = kMinScore; output->gain = kMinScore;
} }
} }
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_gain = kMinScore;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 0;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
// if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(t + bias);
best_gain = current_gain;
}
}
// need restore zero bin
if (bias == 1) {
t = meta_->num_bin - 1 - bias;
double sum_bin0_gradient = sum_gradient;
double sum_bin0_hessian = sum_hessian;
data_size_t cnt_bin0 = num_data;
for (; t >= 0; --t) {
sum_bin0_gradient -= data_[t].sum_gradients;
sum_bin0_hessian -= data_[t].sum_hessians;
cnt_bin0 -= data_[t].cnt;
}
data_size_t other_count = num_data - cnt_bin0;
double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
&& sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
&& other_count >= meta_->tree_config->min_data_in_leaf
&& sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
double sum_other_gradient = sum_gradient - sum_bin0_gradient;
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
if (current_gain > min_gain_shift) {
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(0);
best_gain = current_gain;
}
}
}
}
if (is_splittable_) {
// update split information
output->feature = meta_->feature_idx;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
data_[best_threshold].sum_hessians + kEpsilon);
output->left_count = data_[best_threshold].cnt;
output->left_sum_gradient = data_[best_threshold].sum_gradients;
output->left_sum_hessian = data_[best_threshold].sum_hessians + kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - data_[best_threshold].sum_gradients,
sum_hessian - data_[best_threshold].sum_hessians - kEpsilon);
output->right_count = num_data - data_[best_threshold].cnt;
output->right_sum_gradient = sum_gradient - data_[best_threshold].sum_gradients;
output->right_sum_hessian = sum_hessian - data_[best_threshold].sum_hessians - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = meta_->feature_idx;
output->gain = kMinScore;
}
}
/*! /*!
* \brief Binary size of this histogram * \brief Binary size of this histogram
*/ */
...@@ -188,6 +291,8 @@ private: ...@@ -188,6 +291,8 @@ private:
//std::vector<HistogramBinEntry> data_; //std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */ /*! \brief False if this histogram cannot split */
bool is_splittable_ = true; bool is_splittable_ = true;
std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
}; };
class HistogramPool { class HistogramPool {
public: public:
...@@ -264,7 +369,7 @@ public: ...@@ -264,7 +369,7 @@ public:
uint64_t offset = 0; uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) { for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j)); offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]); pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j); auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) { if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1; num_bin -= 1;
......
...@@ -491,6 +491,7 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri ...@@ -491,6 +491,7 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
*left_leaf = best_Leaf; *left_leaf = best_Leaf;
// split tree, will return right leaf // split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature, *right_leaf = tree->Split(best_Leaf, best_split_info.feature,
train_data_->FeatureBinMapper(best_split_info.feature)->bin_type(),
best_split_info.threshold, best_split_info.threshold,
train_data_->RealFeatureIndex(best_split_info.feature), train_data_->RealFeatureIndex(best_split_info.feature),
train_data_->RealThreshold(best_split_info.feature, best_split_info.threshold), train_data_->RealThreshold(best_split_info.feature, best_split_info.threshold),
......
...@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { ...@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
uint64_t offset = 0; uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) { for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j)); offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j]); smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j]); larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j); auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) { if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1; num_bin -= 1;
......
...@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase): ...@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase):
for preds in zip(pred_from_matr, pred_from_model_file): for preds in zip(pred_from_matr, pred_from_model_file):
self.assertEqual(*preds) self.assertEqual(*preds)
# check pmml # check pmml
# os.system('python ../../pmml/pmml.py model.txt') os.system('python ../../pmml/pmml.py model.txt')
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment