Commit ef778069 authored by Guolin Ke
Browse files

Add categorical feature support back.

parent d93eb338
......@@ -131,7 +131,7 @@ public:
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
threshold, data_indices, num_data, lte_indices, gt_indices);
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*!
* \brief From bin to feature value
......
......@@ -34,6 +34,7 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
......@@ -44,7 +45,7 @@ public:
* \param gain Split gain
* \return The index of new leaf.
*/
int Split(int leaf, int feature, uint32_t threshold, int real_feature,
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
......@@ -113,6 +114,15 @@ public:
/*! \brief Serialize this object to json*/
std::string ToJSON();
template<typename T>
/*!
* \brief Decision function for a categorical split: go left iff the feature
*        value matches the threshold category.
* Values are truncated to int because categorical codes are stored as
* integers even when carried in a floating point type.
*/
static bool CategoricalDecision(T fval, T threshold) {
  return static_cast<int>(fval) == static_cast<int>(threshold);
}
template<typename T>
static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
......@@ -122,13 +132,18 @@ public:
}
}
private:
/*!
* \brief Human-readable name of a split's decision type, used when dumping
*        the tree model: 0 is a numerical "<=" split, anything else is a
*        categorical "is" split.
*/
static const char* GetDecisionTypeName(int8_t type) {
  return type == 0 ? "no_greater" : "is";
}
inline int GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
static std::vector<bool(*)(double, double)> decision_funs;
inline int GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
private:
/*!
* \brief Find leaf index of which record belongs by features
......@@ -157,6 +172,8 @@ private:
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
// used for leaf node
......@@ -173,6 +190,7 @@ private:
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
};
......@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return leaf;
}
/*!
* \brief Find the leaf a record falls into, reading feature bins via iterators.
* Internal nodes carry non-negative ids; a leaf is encoded as the bitwise
* complement of its index, hence the final ~node.
* NOTE(review): iterators is indexed by node id here, while GetLeafRaw indexes
* it by split_feature_inner — presumably callers prepare one iterator per
* node; confirm at call sites.
*/
inline int Tree::GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
                         data_size_t data_idx) const {
  int node = 0;
  while (node >= 0) {
    // numerical decision on the bin value: <= threshold goes left
    if (NumericalDecision<uint32_t>(
      iterators[node]->Get(data_idx),
      threshold_in_bin_[node])) {
      node = left_child_[node];
    } else {
      node = right_child_[node];
    }
  }
  return ~node;
}
/*!
* \brief Find the leaf a record falls into; iterators are indexed by the
*        inner (converted) feature index of each node's split feature.
* Internal nodes carry non-negative ids; a leaf is encoded as the bitwise
* complement of its index, hence the final ~node.
*/
inline int Tree::GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
                            data_size_t data_idx) const {
  int node = 0;
  while (node >= 0) {
    // numerical decision on the bin value: <= threshold goes left
    if (NumericalDecision<uint32_t>(
      iterators[split_feature_inner[node]]->Get(data_idx),
      threshold_in_bin_[node])) {
      node = left_child_[node];
    } else {
      node = right_child_[node];
    }
  }
  return ~node;
}
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
while (node >= 0) {
if (NumericalDecision<double>(
if (decision_funs[decision_type_[node]](
feature_values[split_feature_[node]],
threshold_[node])) {
node = left_child_[node];
......
......@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf):
    # Emit one PMML <SimplePredicate/> element for a tree edge.
    # The diff scrape left both pre- and post-change operator assignments in
    # the body; this is the post-change form, which maps decision_type 1
    # (categorical 'is' split) to equal/notEqual and decision_type 0
    # (numerical split) to lessOrEqual/greaterThan.
    if is_left_child:
        op = 'equal' if decision_type[prev_node_idx] == 1 else 'lessOrEqual'
    else:
        op = 'notEqual' if decision_type[prev_node_idx] == 1 else 'greaterThan'
    out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format(
        get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf)))
......@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature = get_array_ints(next(model_content))
split_gain = next(model_content) # unused
threshold = get_array_strings(next(model_content))
decision_type = get_array_ints(next(model_content))
left_child = get_array_ints(next(model_content))
right_child = get_array_ints(next(model_content))
leaf_parent = get_array_ints(next(model_content))
......
......@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile
import numpy as np
import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json, numeric_types,
range_, string_type)
from .compat import (DataFrame, Series, integer_types, json,
json_default_with_numpy, numeric_types, range_,
string_type)
from .libpath import find_lib_path
......@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
    """Convert a pandas DataFrame (or pass through other data) for LightGBM.

    Encodes pandas ``category`` columns to their integer codes, resolves
    ``feature_name``/``categorical_feature`` when set to ``'auto'``, and
    returns the (possibly converted) data plus the resolved names and the
    per-column category lists (``pandas_categorical``) so validation /
    prediction data can be encoded consistently with training.

    The diff scrape interleaved pre- and post-change lines here; this is the
    reconstructed post-change function.
    """
    if isinstance(data, DataFrame):
        if feature_name == 'auto' or feature_name is None:
            if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]):
                msg = """Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
                warnings.filterwarnings('once')
                warnings.warn(msg, stacklevel=5)
            data = data.rename(columns=str)
        cat_cols = data.select_dtypes(include=['category']).columns
        if pandas_categorical is None:  # train dataset
            pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
        else:
            # valid/test dataset: re-encode with the training categories
            if len(cat_cols) != len(pandas_categorical):
                raise ValueError('train and valid dataset categorical_feature do not match.')
            for col, category in zip(cat_cols, pandas_categorical):
                if list(data[col].cat.categories) != list(category):
                    data[col] = data[col].cat.set_categories(category)
        if len(cat_cols):  # cat_cols is pandas Index object
            data = data.copy()  # not alter origin DataFrame
            data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
        if categorical_feature is not None:
            if feature_name is None:
                feature_name = list(data.columns)
            if categorical_feature == 'auto':
                categorical_feature = list(cat_cols)
            else:
                # user-specified list, extended with detected category columns
                categorical_feature = list(categorical_feature) + list(cat_cols)
        if feature_name == 'auto':
            feature_name = list(data.columns)
        data_dtypes = data.dtypes
        if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
            bad_fields = [data.columns[i] for i, dtype in
                          enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
            msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
            raise ValueError(msg + ', '.join(bad_fields))
        data = data.values.astype('float')
    else:
        # non-DataFrame input: 'auto' resolution is meaningless, drop it
        if feature_name == 'auto':
            feature_name = None
        if categorical_feature == 'auto':
            categorical_feature = None
    return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label):
......@@ -249,6 +277,19 @@ def _label_from_pandas(label):
return label
def _save_pandas_categorical(file_name, pandas_categorical):
    # Append the pandas categorical mapping (one list of categories per
    # categorical column) to the end of a saved model file, so prediction can
    # re-encode raw category values exactly as training did.
    # json_default_with_numpy converts numpy scalars/arrays to JSON-able types.
    with open(file_name, 'a') as f:
        f.write('\npandas_categorical:' + json.dumps(pandas_categorical, default=json_default_with_numpy))
def _load_pandas_categorical(file_name):
with open(file_name, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
return json.loads(last_line[len('pandas_categorical:'):])
return None
class _InnerPredictor(object):
"""
A _InnerPredictor of LightGBM.
......@@ -280,6 +321,7 @@ class _InnerPredictor(object):
ctypes.byref(out_num_class)))
self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif booster_handle is not None:
self.__is_manage_handle = False
self.handle = booster_handle
......@@ -293,6 +335,7 @@ class _InnerPredictor(object):
self.handle,
ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else:
raise TypeError('Need Model file or Booster handle to create a predictor')
......@@ -328,7 +371,7 @@ class _InnerPredictor(object):
"""
if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None)[0]
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE
......@@ -359,6 +402,9 @@ class _InnerPredictor(object):
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else:
try:
csr = scipy.sparse.csr_matrix(data)
......@@ -486,7 +532,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False,
feature_name='auto', params=None,
feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True):
"""
Parameters
......@@ -509,6 +555,11 @@ class Dataset(object):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
......@@ -523,10 +574,12 @@ class Dataset(object):
self.group = group
self.silent = silent
self.feature_name = feature_name
self.categorical_feature = categorical_feature
self.params = params
self.free_raw_data = free_raw_data
self.used_indices = None
self._predictor = None
self.pandas_categorical = None
def __del__(self):
self._free_handle()
......@@ -539,11 +592,11 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None,
silent=False, feature_name='auto',
params=None):
categorical_feature='auto', params=None):
if data is None:
self.handle = None
return
data, feature_name, = _data_from_pandas(data, feature_name)
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
......@@ -555,6 +608,23 @@ class Dataset(object):
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
"""get categorical features"""
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
ref_dataset = None
......@@ -714,7 +784,7 @@ class Dataset(object):
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name,
params=self.params)
categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data:
self.data = None
return self
......@@ -744,6 +814,7 @@ class Dataset(object):
weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret
def subset(self, used_indices, params=None):
......@@ -758,8 +829,9 @@ class Dataset(object):
Other parameters
"""
ret = Dataset(None, reference=self, feature_name=self.feature_name,
params=params)
categorical_feature=self.categorical_feature, params=params)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices
return ret
......@@ -867,6 +939,24 @@ class Dataset(object):
else:
raise TypeError("Unknown type")
def set_categorical_feature(self, categorical_feature):
    """
    Set categorical features

    Parameters
    ----------
    categorical_feature : list of int or str
        Name/index of categorical features
    """
    # no-op when the setting is unchanged
    if self.categorical_feature == categorical_feature:
        return
    if self.data is not None:
        # raw data is still held: record the setting and drop the constructed
        # handle so the dataset is rebuilt lazily with the new categories
        self.categorical_feature = categorical_feature
        self._free_handle()
    else:
        # raw data already freed: cannot re-bin, so this is an error
        raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def _set_predictor(self, predictor):
"""
Set predictor for continued training, not recommand for user to call this function.
......@@ -889,6 +979,7 @@ class Dataset(object):
reference : Dataset
Will use reference as template to consturct current dataset
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
self._set_predictor(reference._predictor)
if self.reference is reference:
......@@ -1117,6 +1208,7 @@ class Booster(object):
self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None:
"""Prediction task"""
out_num_iterations = ctypes.c_int(0)
......@@ -1129,6 +1221,7 @@ class Booster(object):
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif 'model_str' in params:
self.__load_model_from_string(params['model_str'])
else:
......@@ -1144,6 +1237,7 @@ class Booster(object):
def __deepcopy__(self, _):
model_str = self.__save_model_to_string()
booster = Booster({'model_str': model_str})
booster.pandas_categorical = self.pandas_categorical
return booster
def __getstate__(self):
......@@ -1383,6 +1477,7 @@ class Booster(object):
self.handle,
ctypes.c_int(num_iteration),
c_str(filename)))
_save_pandas_categorical(filename, self.pandas_categorical)
def __load_model_from_string(self, model_str):
"""[Private] Load model from string"""
......@@ -1494,6 +1589,7 @@ class Booster(object):
def _to_predictor(self):
"""Convert to predictor"""
predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor
def feature_name(self):
......
......@@ -39,6 +39,15 @@ except (ImportError, SyntaxError):
import json
def json_default_with_numpy(obj):
    """``default`` hook for ``json.dumps``: convert numpy scalars to native
    Python values and numpy arrays to lists; anything else passes through
    unchanged."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.integer, np.floating, np.bool_)):
        return obj.item()
    return obj
"""pandas"""
try:
from pandas import Series, DataFrame
......
......@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
feature_name='auto',
feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""
......@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
......@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100,
train_set._update_params(params)
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False
train_data_name = "training"
......@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10,
data_splitter=None, nfold=5, stratified=False, shuffle=True,
metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto',
feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
......@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
......@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10,
train_set._update_params(params)
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics:
params.setdefault('metric', [])
......
......@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if info in {'split_gain', 'internal_value', 'internal_count'}:
label += '\n' + info + ':' + str(root[info])
graph.node(name, label=label)
l_dec, r_dec = '<=', '>'
if root['decision_type'] == 'no_greater':
l_dec, r_dec = '<=', '>'
elif root['decision_type'] == 'is':
l_dec, r_dec = 'is', "isn't"
else:
raise ValueError('Invalid decision type in tree model.')
add(root['left_child'], name, l_dec)
add(root['right_child'], name, r_dec)
else: # leaf
......
......@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name='auto',
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Fit the gradient boosting model
......@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
......@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
if evals_result:
......@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None,
eval_metric="l2",
early_stopping_rounds=None, verbose=True,
feature_name='auto', callbacks=None):
feature_name='auto', categorical_feature='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
......@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
......@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None,
eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True,
feature_name='auto',
feature_name='auto', categorical_feature='auto',
callbacks=None):
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
......@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
......@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True,
feature_name='auto',
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Most arguments like common methods except following:
......@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
......@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_upper_bound_ = other.bin_upper_bound_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
......@@ -38,22 +44,34 @@ BinMapper::~BinMapper() {
}
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt) {
return false;
} else if (total_cnt - sum_left >= filter_cnt) {
return false;
/*!
* \brief Check whether a feature should be filtered out before training.
* For a numerical feature the candidate left side of a split at bin i is the
* running prefix count; for a categorical feature it is the single bin's
* count. The feature is kept (return false) as soon as any candidate split
* puts at least filter_cnt samples on either side.
*/
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
  const bool is_numerical = (bin_type == BinType::NumericalBin);
  int prefix = 0;
  for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
    const int left_cnt = is_numerical ? (prefix += cnt_in_bin[i]) : cnt_in_bin[i];
    if (left_cnt >= filter_cnt || total_cnt - left_cnt >= filter_cnt) {
      return false;
    }
  }
  return true;
}
void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data) {
// limit max_bin by min_data_in_bin
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
bin_type_ = bin_type;
default_bin_ = 0;
std::vector<double>& raw_values = values;
int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
// find distinct_values first
......@@ -95,98 +113,134 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size());
if (num_values <= max_bin) {
// use distinct value is enough
bin_upper_bound_.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cnt_in_bin.push_back(cur_cnt_inbin);
cur_cnt_inbin = 0;
if (bin_type_ == BinType::NumericalBin) {
if (num_values <= max_bin) {
// use distinct value is enough
bin_upper_bound_.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cnt_in_bin.push_back(cur_cnt_inbin);
cur_cnt_inbin = 0;
}
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = static_cast<int>(raw_values.size());
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
}
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = static_cast<int>(raw_values.size());
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
CHECK(num_bin_ <= max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
counts_int.push_back(counts[0]);
for (size_t i = 1; i < distinct_values.size(); ++i) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// will ingore the categorical of small counts
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
}
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
cnt_in_bin = counts_int;
counts_int.resize(num_bin_);
counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// check trival(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trival_ = true;
default_bin_ = 0;
} else {
is_trival_ = false;
default_bin_ = ValueToBin(0);
}
if (NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data)) {
// check useless bin
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trival_ = true;
}
if (!is_trival_) {
default_bin_ = ValueToBin(0);
}
// calculate sparse rate
CHECK(num_bin_ <= max_bin);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(total_sample_cnt);
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
}
......@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += sizeof(int);
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t);
......@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
}
void BinMapper::CopyFrom(const char * buffer) {
......@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
}
}
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file);
fwrite(&default_bin_, sizeof(default_bin_), 1, file);
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
}
}
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
ret += sizeof(double) * num_bin_;
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
}
return ret;
}
......
......@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column);
GetString(params, "categorical_column", &categorical_column);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
GetInt(params, "min_dato_in_bin", &min_data_in_bin);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
......
......@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup(
void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_indices,
size_t total_sample_cnt,
const std::vector<std::vector<int>>&,
size_t,
const IOConfig& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size());
// get num_features
......
......@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_.emplace(group_idx_);
}
}
if (io_config_.categorical_column.size() > 0) {
if (Common::StartsWith(io_config_.categorical_column, name_prefix)) {
std::string names = io_config_.categorical_column.substr(name_prefix.size());
for (auto name : Common::Split(names.c_str(), ',')) {
if (name2idx.count(name) > 0) {
int tmp = name2idx[name];
categorical_features_.emplace(tmp);
} else {
Log::Fatal("Could not find categorical_column %s in data file", name.c_str());
}
}
} else {
for (auto token : Common::Split(io_config_.categorical_column.c_str(), ',')) {
int tmp = 0;
if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
Log::Fatal("categorical_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
}
categorical_features_.emplace(tmp);
}
}
}
}
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
......@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
bin_mappers[i] = nullptr;
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], total_sample_size,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->feature_names_ = feature_names_;
......@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
bin_mappers[i] = nullptr;
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
} else {
// if have multi-machines, need find bin distributed
......@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
BinMapper bin_mapper;
bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
}
// convert to binary size
......
......@@ -132,7 +132,7 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
......@@ -144,19 +144,37 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
......
......@@ -161,7 +161,7 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin);
......@@ -173,19 +173,37 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
......
......@@ -125,7 +125,7 @@ public:
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
// not need to split
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
......@@ -139,19 +139,37 @@ public:
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
......
......@@ -15,6 +15,11 @@
namespace LightGBM {
std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
{ Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) {
......@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves)
split_feature_ = std::vector<int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1);
leaf_parent_ = std::vector<int>(max_leaves_);
leaf_value_ = std::vector<double>(max_leaves_);
......@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves)
num_leaves_ = 1;
leaf_parent_[0] = -1;
shrinkage_ = 1.0f;
has_categorical_ = false;
}
Tree::~Tree() {
}
int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1;
......@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
// add new node
split_feature_inner[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
} else {
has_categorical_ = true;
decision_type_[new_node_idx] = 1;
}
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double;
split_gain_[new_node_idx] = gain;
......@@ -84,62 +97,196 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
}
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeaf(iter, i)]);
}
});
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeafRaw(iter, i)]);
}
});
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
}
}
void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iter, used_data_indices[i])]);
}
});
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeafRaw(iter, used_data_indices[i])]);
}
});
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
}
}
......@@ -152,6 +299,8 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "threshold="
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "decision_type="
<< Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
str_buf << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "right_child="
......@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) {
str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
......@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) {
|| key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
|| key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("shrinkage") <= 0
|| key_vals.count("decision_type") <= 0
) {
Log::Fatal("Tree model string format error");
}
......@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) {
right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
......
......@@ -41,9 +41,16 @@ public:
* \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) {
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta, BinType bin_type) {
meta_ = meta;
data_ = data;
if (bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
}
HistogramBinEntry* RawData() {
......@@ -60,9 +67,14 @@ public:
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
sum_hessian += 2 * kEpsilon;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN;
double best_gain = kMinScore;
......@@ -131,6 +143,97 @@ public:
output->gain = kMinScore;
}
}
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_gain = kMinScore;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 0;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
// if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(t + bias);
best_gain = current_gain;
}
}
// need restore zero bin
if (bias == 1) {
t = meta_->num_bin - 1 - bias;
double sum_bin0_gradient = sum_gradient;
double sum_bin0_hessian = sum_hessian;
data_size_t cnt_bin0 = num_data;
for (; t >= 0; --t) {
sum_bin0_gradient -= data_[t].sum_gradients;
sum_bin0_hessian -= data_[t].sum_hessians;
cnt_bin0 -= data_[t].cnt;
}
data_size_t other_count = num_data - cnt_bin0;
double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
&& sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
&& other_count >= meta_->tree_config->min_data_in_leaf
&& sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
double sum_other_gradient = sum_gradient - sum_bin0_gradient;
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
if (current_gain > min_gain_shift) {
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(0);
best_gain = current_gain;
}
}
}
}
if (is_splittable_) {
// update split information
output->feature = meta_->feature_idx;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
data_[best_threshold].sum_hessians + kEpsilon);
output->left_count = data_[best_threshold].cnt;
output->left_sum_gradient = data_[best_threshold].sum_gradients;
output->left_sum_hessian = data_[best_threshold].sum_hessians + kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - data_[best_threshold].sum_gradients,
sum_hessian - data_[best_threshold].sum_hessians - kEpsilon);
output->right_count = num_data - data_[best_threshold].cnt;
output->right_sum_gradient = sum_gradient - data_[best_threshold].sum_gradients;
output->right_sum_hessian = sum_hessian - data_[best_threshold].sum_hessians - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = meta_->feature_idx;
output->gain = kMinScore;
}
}
/*!
* \brief Binary size of this histogram
*/
......@@ -188,6 +291,8 @@ private:
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
};
class HistogramPool {
public:
......@@ -264,7 +369,7 @@ public:
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
......
......@@ -490,7 +490,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature,
*right_leaf = tree->Split(best_Leaf, best_split_info.feature,
train_data_->FeatureBinMapper(best_split_info.feature)->bin_type(),
best_split_info.threshold,
train_data_->RealFeatureIndex(best_split_info.feature),
train_data_->RealThreshold(best_split_info.feature, best_split_info.threshold),
......
......@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j]);
larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j]);
smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
......
......@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase):
for preds in zip(pred_from_matr, pred_from_model_file):
self.assertEqual(*preds)
# check pmml
# os.system('python ../../pmml/pmml.py model.txt')
os.system('python ../../pmml/pmml.py model.txt')
print("----------------------------------------------------------------------")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment