Commit eade219e authored by Qiwei Ye

merge conflict

parents f23e6083 060bd316
# coding: utf-8
# pylint: disable = C0103
"""Plotting Library."""
from __future__ import absolute_import
import warnings
from copy import deepcopy
from io import BytesIO
import numpy as np
from .basic import Booster
from .sklearn import LGBMModel
def check_not_tuple_of_2_elements(obj, obj_name='obj'):
"""check object is not tuple or does not have 2 elements"""
if not isinstance(obj, tuple) or len(obj) != 2:
raise TypeError('%s must be a tuple of 2 elements.' % obj_name)
def plot_importance(booster, ax=None, height=0.2,
xlim=None, ylim=None, title='Feature importance',
xlabel='Feature importance', ylabel='Features',
importance_type='split', max_num_features=None,
ignore_zero=True, figsize=None, grid=True, **kwargs):
"""Plot model feature importances.
Parameters
----------
booster : Booster or LGBMModel
Booster or LGBMModel instance
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
height : float
Bar height, passed to ax.barh()
xlim : tuple of 2 elements
Tuple passed to ax.set_xlim()
ylim : tuple of 2 elements
Tuple passed to ax.set_ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable.
importance_type : str
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
max_num_features : int
Max number of top features displayed on plot.
If None or smaller than 1, all features will be displayed.
ignore_zero : bool
Whether to ignore features with zero importance
figsize : tuple of 2 elements
Figure size
grid : bool
Whether to add a grid to the axes
**kwargs :
Other keywords passed to ax.barh()
Returns
-------
ax : matplotlib Axes
"""
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError('You must install matplotlib to plot importance.')
if isinstance(booster, LGBMModel):
booster = booster.booster_
elif not isinstance(booster, Booster):
raise TypeError('booster must be Booster or LGBMModel.')
importance = booster.feature_importance(importance_type=importance_type)
feature_name = booster.feature_name()
if not len(importance):
raise ValueError('Booster feature_importances are empty.')
tuples = sorted(zip(feature_name, importance), key=lambda x: x[1])
if ignore_zero:
tuples = [x for x in tuples if x[1] > 0]
if max_num_features is not None and max_num_features > 0:
tuples = tuples[-max_num_features:]
labels, values = zip(*tuples)
if ax is None:
if figsize is not None:
check_not_tuple_of_2_elements(figsize, 'figsize')
_, ax = plt.subplots(1, 1, figsize=figsize)
ylocs = np.arange(len(values))
ax.barh(ylocs, values, align='center', height=height, **kwargs)
for x, y in zip(values, ylocs):
ax.text(x + 1, y, x, va='center')
ax.set_yticks(ylocs)
ax.set_yticklabels(labels)
if xlim is not None:
check_not_tuple_of_2_elements(xlim, 'xlim')
else:
xlim = (0, max(values) * 1.1)
ax.set_xlim(xlim)
if ylim is not None:
check_not_tuple_of_2_elements(ylim, 'ylim')
else:
ylim = (-1, len(values))
ax.set_ylim(ylim)
if title is not None:
ax.set_title(title)
if xlabel is not None:
ax.set_xlabel(xlabel)
if ylabel is not None:
ax.set_ylabel(ylabel)
ax.grid(grid)
return ax
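# Usage sketch (editorial example, not part of this module's source;
# `params` and `train_data` are assumed to exist):
#
#   import lightgbm as lgb
#   import matplotlib.pyplot as plt
#   gbm = lgb.train(params, train_data, num_boost_round=10)
#   ax = lgb.plot_importance(gbm, max_num_features=10, importance_type='split')
#   plt.show()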
def plot_metric(booster, metric=None, dataset_names=None,
ax=None, xlim=None, ylim=None,
title='Metric during training',
xlabel='Iterations', ylabel='auto',
figsize=None, grid=True):
"""Plot one metric during training.
Parameters
----------
booster : dict or LGBMModel
Evals_result recorded by lightgbm.train() or LGBMModel instance
metric : str or None
The metric name to plot.
Only one metric is supported because different metrics have different scales.
Pass None to pick the first one (according to dict ordering).
dataset_names : None or list of str
List of the dataset names to plot.
Pass None to plot all datasets.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
xlim : tuple of 2 elements
Tuple passed to ax.set_xlim()
ylim : tuple of 2 elements
Tuple passed to ax.set_ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable. Pass 'auto' to use `metric`.
figsize : tuple of 2 elements
Figure size
grid : bool
Whether to add a grid to the axes
Returns
-------
ax : matplotlib Axes
"""
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError('You must install matplotlib to plot metric.')
if isinstance(booster, LGBMModel):
eval_results = deepcopy(booster.evals_result_)
elif isinstance(booster, dict):
eval_results = deepcopy(booster)
else:
raise TypeError('booster must be dict or LGBMModel.')
num_data = len(eval_results)
if not num_data:
raise ValueError('eval results cannot be empty.')
if ax is None:
if figsize is not None:
check_not_tuple_of_2_elements(figsize, 'figsize')
_, ax = plt.subplots(1, 1, figsize=figsize)
if dataset_names is None:
dataset_names = iter(eval_results.keys())
elif not isinstance(dataset_names, (list, tuple, set)) or not dataset_names:
raise ValueError('dataset_names should be iterable and cannot be empty')
else:
dataset_names = iter(dataset_names)
name = next(dataset_names) # take one as sample
metrics_for_one = eval_results[name]
num_metric = len(metrics_for_one)
if metric is None:
if num_metric > 1:
msg = """more than one metric available, picking one to plot."""
warnings.warn(msg, stacklevel=2)
metric, results = metrics_for_one.popitem()
else:
if metric not in metrics_for_one:
raise KeyError('No given metric in eval results.')
results = metrics_for_one[metric]
num_iteration, max_result, min_result = len(results), max(results), min(results)
x_ = range(num_iteration)
ax.plot(x_, results, label=name)
for name in dataset_names:
metrics_for_one = eval_results[name]
results = metrics_for_one[metric]
max_result, min_result = max(max(results), max_result), min(min(results), min_result)
ax.plot(x_, results, label=name)
ax.legend(loc='best')
if xlim is not None:
check_not_tuple_of_2_elements(xlim, 'xlim')
else:
xlim = (0, num_iteration)
ax.set_xlim(xlim)
if ylim is not None:
check_not_tuple_of_2_elements(ylim, 'ylim')
else:
range_result = max_result - min_result
ylim = (min_result - range_result * 0.2, max_result + range_result * 0.2)
ax.set_ylim(ylim)
if ylabel == 'auto':
ylabel = metric
if title is not None:
ax.set_title(title)
if xlabel is not None:
ax.set_xlabel(xlabel)
if ylabel is not None:
ax.set_ylabel(ylabel)
ax.grid(grid)
return ax
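# Usage sketch (editorial example; assumes lgb.train() was called with a
# validation set and an `evals_result` dict to record metric values):
#
#   evals_result = {}
#   gbm = lgb.train(params, train_data, valid_sets=[valid_data],
#                   valid_names=['valid'], evals_result=evals_result)
#   ax = lgb.plot_metric(evals_result, metric='l2', dataset_names=['valid'])
#   plt.show()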
def _to_graphviz(graph, tree_info, show_info, feature_names):
"""Convert specified tree to graphviz instance."""
def add(root, parent=None, decision=None):
"""recursively add node or edge"""
if 'split_index' in root: # non-leaf
name = 'split' + str(root['split_index'])
if feature_names is not None:
label = 'split_feature_name:' + str(feature_names[root['split_feature']])
else:
label = 'split_feature_index:' + str(root['split_feature'])
label += '\nthreshold:' + str(root['threshold'])
for info in show_info:
if info in {'split_gain', 'internal_value', 'internal_count'}:
label += '\n' + info + ':' + str(root[info])
graph.node(name, label=label)
if root['decision_type'] == 'no_greater':
l_dec, r_dec = '<=', '>'
elif root['decision_type'] == 'is':
l_dec, r_dec = 'is', "isn't"
else:
raise ValueError('Invalid decision type in tree model.')
add(root['left_child'], name, l_dec)
add(root['right_child'], name, r_dec)
else: # leaf
name = 'leaf' + str(root['leaf_index'])
label = 'leaf_value:' + str(root['leaf_value'])
if 'leaf_count' in show_info:
label += '\nleaf_count:' + str(root['leaf_count'])
graph.node(name, label=label)
if parent is not None:
graph.edge(parent, name, decision)
add(tree_info['tree_structure'])
return graph
def plot_tree(booster, ax=None, tree_index=0, figsize=None,
graph_attr=None, node_attr=None, edge_attr=None,
show_info=None):
"""Plot specified tree.
Parameters
----------
booster : Booster, LGBMModel
Booster or LGBMModel instance.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
tree_index : int, default 0
Specify tree index of target tree.
figsize : tuple of 2 elements
Figure size.
graph_attr : dict
Mapping of (attribute, value) pairs for the graph.
node_attr : dict
Mapping of (attribute, value) pairs set for all nodes.
edge_attr : dict
Mapping of (attribute, value) pairs set for all edges.
show_info : list
Information shown on nodes.
Options: 'split_gain', 'internal_value', 'internal_count' or 'leaf_count'.
Returns
-------
ax : matplotlib Axes
"""
try:
import matplotlib.pyplot as plt
import matplotlib.image as image
except ImportError:
raise ImportError('You must install matplotlib to plot tree.')
try:
from graphviz import Digraph
except ImportError:
raise ImportError('You must install graphviz to plot tree.')
if ax is None:
if figsize is not None:
check_not_tuple_of_2_elements(figsize, 'figsize')
_, ax = plt.subplots(1, 1, figsize=figsize)
if isinstance(booster, LGBMModel):
booster = booster.booster_
elif not isinstance(booster, Booster):
raise TypeError('booster must be Booster or LGBMModel.')
model = booster.dump_model()
tree_infos = model['tree_info']
if 'feature_names' in model:
feature_names = model['feature_names']
else:
feature_names = None
if tree_index < len(tree_infos):
tree_info = tree_infos[tree_index]
else:
raise IndexError('tree_index is out of range.')
graph = Digraph(graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr)
if show_info is None:
show_info = []
ret = _to_graphviz(graph, tree_info, show_info, feature_names)
s = BytesIO()
s.write(ret.pipe(format='png'))
s.seek(0)
img = image.imread(s)
ax.imshow(img)
ax.axis('off')
return ax
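# Usage sketch (editorial example; requires both matplotlib and graphviz,
# `gbm` is an assumed trained Booster or LGBMModel):
#
#   ax = lgb.plot_tree(gbm, tree_index=0,
#                      show_info=['split_gain', 'internal_count'])
#   plt.show()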
......@@ -130,6 +130,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0, nthread=-1, silent=True,
sigmoid=1.0, huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
poisson_max_delta_step=0.7,
max_position=20, label_gain=None,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
......@@ -192,6 +193,8 @@ class LGBMModel(LGBMModelBase):
It is used to control the width of Gaussian function to approximate hessian.
fair_c : float
Only used in regression. Parameter for Fair loss function.
poisson_max_delta_step : float
Parameter used to safeguard optimization in Poisson regression.
max_position : int
Only used in lambdarank, will optimize NDCG at this position.
label_gain : list of float
......@@ -259,6 +262,7 @@ class LGBMModel(LGBMModelBase):
self.huber_delta = huber_delta
self.gaussian_eta = gaussian_eta
self.fair_c = fair_c
self.poisson_max_delta_step = poisson_max_delta_step
self.max_position = max_position
self.label_gain = label_gain
self.drop_rate = drop_rate
......@@ -280,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Fit the gradient boosting model
......@@ -311,12 +315,14 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use the DataFrame column names
categorical_feature : list of str or int, or 'auto'
Categorical features;
type int represents index,
type str represents feature names (need to specify feature_name as well).
If 'auto' and data is pandas DataFrame, use pandas categorical columns
(a short usage sketch follows this docstring excerpt)
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
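# Usage sketch for the new 'auto' defaults (editorial example; `df` and
# `y` are assumed; pandas is only needed when relying on 'auto'):
#
#   df = pd.DataFrame({'x1': [1, 2, 3, 4],
#                      'x2': pd.Categorical(['a', 'b', 'a', 'b'])})
#   model = LGBMClassifier().fit(df, y)  # names/categoricals inferred
#   model = LGBMClassifier().fit(df, y, feature_name=['x1', 'x2'],
#                                categorical_feature=['x2'])  # explicit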
......@@ -461,7 +467,7 @@ class LGBMModel(LGBMModelBase):
return self.evals_result
@property
def feature_importance_(self):
def feature_importances_(self):
"""Get normailized feature importances."""
importace_array = self.booster_.feature_importance().astype(np.float32)
return importace_array / importace_array.sum()
......@@ -470,9 +476,9 @@ class LGBMModel(LGBMModelBase):
def booster(self):
return self.booster_
@LGBMDeprecated('Use attribute feature_importance_ instead.')
@LGBMDeprecated('Use attribute feature_importances_ instead.')
def feature_importance(self):
return self.feature_importance_
return self.feature_importances_
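# Usage sketch (editorial example): the sklearn-style attribute is now
# preferred, while the old method keeps working behind a deprecation
# warning.
#
#   model.fit(X, y)
#   print(model.feature_importances_)   # new, sklearn-compatible
#   print(model.feature_importance())   # deprecated alias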
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
......@@ -485,6 +491,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
reg_alpha=0, reg_lambda=0,
seed=0, nthread=-1, silent=True,
huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
poisson_max_delta_step=0.7,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
super(LGBMRegressor, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
......@@ -497,6 +504,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
reg_alpha=reg_alpha, reg_lambda=reg_lambda,
seed=seed, nthread=nthread, silent=silent,
huber_delta=huber_delta, gaussian_eta=gaussian_eta, fair_c=fair_c,
poisson_max_delta_step=poisson_max_delta_step,
drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)
......@@ -506,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None,
eval_metric="l2",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, callbacks=None):
feature_name='auto', categorical_feature='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
......@@ -550,9 +558,9 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric="binary_logloss",
eval_metric="logloss",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
......@@ -562,8 +570,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if self.n_classes > 2:
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
if eval_set is not None and eval_metric == "binary_logloss":
if eval_metric == 'logloss' or eval_metric == 'binary_logloss':
eval_metric = "multi_logloss"
elif eval_metric == 'error' or eval_metric == 'binary_error':
eval_metric = "multi_error"
else:
if eval_metric == 'logloss' or eval_metric == 'multi_logloss':
eval_metric = 'binary_logloss'
elif eval_metric == 'error' or eval_metric == 'multi_error':
eval_metric = 'binary_error'
if eval_set is not None:
eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]
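# Sketch of the metric aliasing above (editorial example): generic names
# are remapped to the metric matching the detected number of classes, so
# the same call works for binary and multiclass problems.
#
#   clf = LGBMClassifier()
#   clf.fit(X, y, eval_set=[(X, y)], eval_metric='logloss')
#   # -> 'binary_logloss' for 2 classes, 'multi_logloss' otherwise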
......@@ -653,7 +668,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Most arguments are the same as in the common fit method, except the following:
......
......@@ -12,7 +12,7 @@
#include "predictor.hpp"
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdio>
#include <ctime>
......@@ -33,7 +33,7 @@ Application::Application(int argc, char** argv) {
omp_set_num_threads(config_.num_threads);
}
if (config_.io_config.data_filename.size() == 0) {
Log::Fatal("No training/prediction data, application quit");
Log::Fatal("No training/prediction data, application quit");
}
}
......@@ -226,10 +226,10 @@ void Application::Train() {
int total_iter = config_.boosting_config.num_iterations;
bool is_finished = false;
bool need_eval = true;
auto start_time = std::chrono::high_resolution_clock::now();
auto start_time = std::chrono::steady_clock::now();
for (int iter = 0; iter < total_iter && !is_finished; ++iter) {
is_finished = boosting_->TrainOneIter(nullptr, nullptr, need_eval);
auto end_time = std::chrono::high_resolution_clock::now();
auto end_time = std::chrono::steady_clock::now();
// output used time per iteration
Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1);
......
......@@ -6,7 +6,7 @@
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstring>
#include <cstdio>
......@@ -19,7 +19,7 @@
namespace LightGBM {
/*!
* \brief Used to prediction data with input model
* \brief Used to predict data with input model
*/
class Predictor {
public:
......@@ -27,7 +27,7 @@ public:
* \brief Constructor
* \param boosting Input boosting model
* \param is_raw_score True if need to predict result with raw score
* \param predict_leaf_index True if output leaf index instead of prediction score
* \param is_predict_leaf_index True if output leaf index instead of prediction score
*/
Predictor(const Boosting* boosting, bool is_raw_score, bool is_predict_leaf_index) {
boosting_ = boosting;
......@@ -69,14 +69,13 @@ public:
~Predictor() {
}
inline const PredictFunction& GetPredictFunction() {
inline const PredictFunction& GetPredictFunction() const {
return predict_fun_;
}
/*!
* \brief Predict on data, then save the results to disk
* \param data_filename Filename of the data
* \param result_filename Filename of the output result
* \param has_header True if the data file has a header
*/
void Predict(const char* data_filename, const char* result_filename, bool has_header) {
......
#include <LightGBM/boosting.h>
#include "gbdt.h"
#include "dart.hpp"
#include "goss.hpp"
namespace LightGBM {
......@@ -10,7 +11,7 @@ std::string GetBoostingTypeFromModelFile(const char* filename) {
return type;
}
void Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
bool Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
if (boosting != nullptr) {
TextReader<size_t> model_reader(filename, true);
model_reader.ReadAllLines();
......@@ -18,8 +19,11 @@ void Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
for (auto& line : model_reader.Lines()) {
str_buf << line << '\n';
}
boosting->LoadModelFromString(str_buf.str());
if (!boosting->LoadModelFromString(str_buf.str()))
return false;
}
return true;
}
Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename) {
......@@ -28,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GBDT();
} else if (type == std::string("dart")) {
return new DART();
} else if (type == std::string("goss")) {
return new GOSS();
} else {
return nullptr;
}
......@@ -39,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GBDT());
} else if (type == std::string("dart")) {
ret.reset(new DART());
} else if (type == std::string("goss")) {
ret.reset(new GOSS());
} else {
Log::Fatal("unknow boosting type %s", type.c_str());
}
LoadFileToBoosting(ret.get(), filename);
} else {
......
......@@ -38,6 +38,11 @@ public:
random_for_drop_ = Random(gbdt_config_->drop_seed);
sum_weight_ = 0.0f;
}
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
}
/*!
* \brief one training iteration
*/
......@@ -78,7 +83,7 @@ private:
*/
void DroppingTrees() {
drop_index_.clear();
bool is_skip = random_for_drop_.NextDouble() < gbdt_config_->skip_drop;
bool is_skip = random_for_drop_.NextFloat() < gbdt_config_->skip_drop;
// select dropping tree indexes based on drop_rate and tree weights
if (!is_skip) {
double drop_rate = gbdt_config_->drop_rate;
......@@ -88,7 +93,7 @@ private:
drop_rate = std::min(drop_rate, gbdt_config_->max_drop * inv_average_weight / sum_weight_);
}
for (int i = 0; i < iter_; ++i) {
if (random_for_drop_.NextDouble() < drop_rate * tree_weight_[i] * inv_average_weight) {
if (random_for_drop_.NextFloat() < drop_rate * tree_weight_[i] * inv_average_weight) {
drop_index_.push_back(i);
}
}
......@@ -97,7 +102,7 @@ private:
drop_rate = std::min(drop_rate, gbdt_config_->max_drop / static_cast<double>(iter_));
}
for (int i = 0; i < iter_; ++i) {
if (random_for_drop_.NextDouble() < drop_rate) {
if (random_for_drop_.NextFloat() < drop_rate) {
drop_index_.push_back(i);
}
}
......
#include "gbdt.h"
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
......@@ -18,6 +17,17 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> boosting_time;
std::chrono::duration<double, std::milli> train_score_time;
std::chrono::duration<double, std::milli> out_of_bag_score_time;
std::chrono::duration<double, std::milli> valid_score_time;
std::chrono::duration<double, std::milli> metric_time;
std::chrono::duration<double, std::milli> bagging_time;
std::chrono::duration<double, std::milli> sub_gradient_time;
std::chrono::duration<double, std::milli> tree_time;
#endif // TIMETAG
GBDT::GBDT()
:iter_(0),
train_data_(nullptr),
......@@ -25,7 +35,7 @@ GBDT::GBDT()
early_stopping_round_(0),
max_feature_idx_(0),
num_class_(1),
sigmoid_(1.0f),
sigmoid_(-1.0f),
num_iteration_for_pred_(0),
shrinkage_rate_(0.1f),
num_init_iteration_(0) {
......@@ -37,7 +47,16 @@ GBDT::GBDT()
}
GBDT::~GBDT() {
#ifdef TIMETAG
Log::Info("GBDT::boosting costs %f", boosting_time * 1e-3);
Log::Info("GBDT::train_score costs %f", train_score_time * 1e-3);
Log::Info("GBDT::out_of_bag_score costs %f", out_of_bag_score_time * 1e-3);
Log::Info("GBDT::valid_score costs %f", valid_score_time * 1e-3);
Log::Info("GBDT::metric costs %f", metric_time * 1e-3);
Log::Info("GBDT::bagging costs %f", bagging_time * 1e-3);
Log::Info("GBDT::sub_gradient costs %f", sub_gradient_time * 1e-3);
Log::Info("GBDT::tree costs %f", tree_time * 1e-3);
#endif
}
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
......@@ -46,9 +65,6 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
num_iteration_for_pred_ = 0;
max_feature_idx_ = 0;
num_class_ = config->num_class;
for (int i = 0; i < num_threads_; ++i) {
random_.emplace_back(config->bagging_seed + i);
}
train_data_ = nullptr;
gbdt_config_ = nullptr;
tree_learner_ = nullptr;
......@@ -107,6 +123,10 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
max_feature_idx_ = train_data->num_total_features() - 1;
// get label index
label_idx_ = train_data->label_idx();
// get feature names
feature_names_ = train_data->feature_names();
feature_infos_ = train_data->feature_infos();
}
if ((train_data_ != train_data && train_data != nullptr)
......@@ -122,16 +142,26 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
right_cnts_buf_.resize(num_threads_);
left_write_pos_buf_.resize(num_threads_);
right_write_pos_buf_.resize(num_threads_);
double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
is_use_subset_ = false;
if (average_bag_rate <= 0.5) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
tmp_subset_->CopyFeatureMapperFrom(train_data);
is_use_subset_ = true;
Log::Debug("use subset for bagging");
}
} else {
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
tmp_indices_.clear();
is_use_subset_ = false;
}
}
train_data_ = train_data;
if (train_data_ != nullptr) {
// reset config for tree learner
tree_learner_->ResetConfig(&new_config->tree_config);
is_class_end_ = std::vector<bool>(num_class_, false);
}
gbdt_config_.reset(new_config.release());
}
......@@ -168,34 +198,39 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
valid_metrics_.back().shrink_to_fit();
}
data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer){
const int tid = omp_get_thread_num();
data_size_t GBDT::BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer){
if (cnt <= 0) {
return 0;
}
data_size_t bag_data_cnt =
static_cast<data_size_t>(gbdt_config_->bagging_fraction * cnt);
data_size_t cur_left_cnt = 0;
data_size_t cur_right_cnt = 0;
auto right_buffer = buffer + bag_data_cnt;
// random bagging, minimal unit is one record
for (data_size_t i = 0; i < cnt; ++i) {
double prob =
(bag_data_cnt - cur_left_cnt) / static_cast<double>(cnt - i);
if (random_[tid].NextDouble() < prob) {
float prob =
(bag_data_cnt - cur_left_cnt) / static_cast<float>(cnt - i);
if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i;
} else {
buffer[bag_data_cnt + cur_right_cnt++] = start + i;
right_buffer[cur_right_cnt++] = start + i;
}
}
CHECK(cur_left_cnt == bag_data_cnt);
return cur_left_cnt;
}
void GBDT::Bagging(int iter) {
// if need bagging
if (bag_data_cnt_ < num_data_ && iter % gbdt_config_->bagging_freq == 0) {
const data_size_t min_inner_size = 10000;
const data_size_t min_inner_size = 1000;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
#pragma omp parallel for schedule(static, 1)
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
......@@ -203,7 +238,8 @@ void GBDT::Bagging(int iter) {
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
data_size_t cur_left_count = BaggingHelper(cur_start, cur_cnt, tmp_indices_.data() + cur_start);
Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
......@@ -228,47 +264,114 @@ void GBDT::Bagging(int iter) {
tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
bag_data_cnt_ = left_cnt;
CHECK(bag_data_indices_[bag_data_cnt_ - 1] > bag_data_indices_[bag_data_cnt_]);
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get());
}
}
}
void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int curr_class) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) {
if (num_data_ - bag_data_cnt_ > 0 && !is_use_subset_) {
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, curr_class);
}
#ifdef TIMETAG
out_of_bag_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
// boosting first
if (gradient == nullptr || hessian == nullptr) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
Boosting();
gradient = gradients_.data();
hessian = hessians_.data();
#ifdef TIMETAG
boosting_time += std::chrono::steady_clock::now() - start_time;
#endif
}
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// bagging logic
Bagging(iter_);
#ifdef TIMETAG
bagging_time += std::chrono::steady_clock::now() - start_time;
#endif
if (is_use_subset_ && bag_data_cnt_ < num_data_) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
if (gradients_.empty()) {
size_t total_size = static_cast<size_t>(num_data_) * num_class_;
gradients_.resize(total_size);
hessians_.resize(total_size);
}
// get sub gradients
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto bias = curr_class * num_data_;
// cannot be multi-threaded here
for (int i = 0; i < bag_data_cnt_; ++i) {
gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
}
}
gradient = gradients_.data();
hessian = hessians_.data();
#ifdef TIMETAG
sub_gradient_time += std::chrono::steady_clock::now() - start_time;
#endif
}
bool should_continue = false;
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
// train a new tree
std::unique_ptr<Tree> new_tree(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
// if cannot learn a new tree, then stop
if (new_tree->num_leaves() <= 1) {
Log::Info("Stopped training because there are no more leafs that meet the split requirements.");
return true;
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
std::unique_ptr<Tree> new_tree(new Tree(2));
if (!is_class_end_[curr_class]) {
// train a new tree
new_tree.reset(tree_learner_->Train(gradient + curr_class * num_data_, hessian + curr_class * num_data_));
}
#ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time;
#endif
if (new_tree->num_leaves() > 1) {
should_continue = true;
// shrinkage by learning rate
new_tree->Shrinkage(shrinkage_rate_);
// update score
UpdateScore(new_tree.get(), curr_class);
UpdateScoreOutOfBag(new_tree.get(), curr_class);
} else {
is_class_end_[curr_class] = true;
}
// shrinkage by learning rate
new_tree->Shrinkage(shrinkage_rate_);
// update score
UpdateScore(new_tree.get(), curr_class);
UpdateScoreOutOfBag(new_tree.get(), curr_class);
// add model
models_.push_back(std::move(new_tree));
}
if (!should_continue) {
Log::Warning("Stopped training because there are no more leaves that meet the split requirements.");
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
models_.pop_back();
}
return true;
}
++iter_;
if (is_eval) {
return EvalAndCheckEarlyStopping();
......@@ -294,13 +397,20 @@ void GBDT::RollbackOneIter() {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
models_.pop_back();
}
is_class_end_ = std::vector<bool>(num_class_, false);
--iter_;
}
bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// print message for metric
auto best_msg = OutputMetric(iter_);
#ifdef TIMETAG
metric_time += std::chrono::steady_clock::now() - start_time;
#endif
is_met_early_stopping = !best_msg.empty();
if (is_met_early_stopping) {
Log::Info("Early stopping at iteration %d, the best iteration round is %d",
......@@ -315,12 +425,28 @@ bool GBDT::EvalAndCheckEarlyStopping() {
}
void GBDT::UpdateScore(const Tree* tree, const int curr_class) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// update training score
train_score_updater_->AddScore(tree_learner_.get(), curr_class);
if (!is_use_subset_) {
train_score_updater_->AddScore(tree_learner_.get(), curr_class);
} else {
train_score_updater_->AddScore(tree, curr_class);
}
#ifdef TIMETAG
train_score_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// update validation score
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, curr_class);
}
#ifdef TIMETAG
valid_score_time += std::chrono::steady_clock::now() - start_time;
#endif
}
std::string GBDT::OutputMetric(int iter) {
......@@ -441,7 +567,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
} else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(- sigmoid_ * raw_scores[i])));
}
} else {
#pragma omp parallel for schedule(static)
......@@ -472,14 +598,8 @@ std::string GBDT::DumpModel(int num_iteration) const {
str_buf << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
str_buf << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
// output feature names
auto feature_names = std::ref(feature_names_);
if (train_data_ != nullptr) {
feature_names = std::ref(train_data_->feature_names());
}
str_buf << "\"feature_names\":[\""
<< Common::Join(feature_names.get(), "\",\"") << "\"],"
<< Common::Join(feature_names_, "\",\"") << "\"],"
<< std::endl;
str_buf << "\"tree_info\":[";
......@@ -503,51 +623,61 @@ std::string GBDT::DumpModel(int num_iteration) const {
return str_buf.str();
}
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
std::string GBDT::SaveModelToString(int num_iterations) const {
std::stringstream ss;
// output model type
ss << SubModelName() << std::endl;
// output number of class
ss << "num_class=" << num_class_ << std::endl;
// output label index
ss << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name
if (object_function_ != nullptr) {
ss << "objective=" << object_function_->GetName() << std::endl;
}
// output sigmoid parameter
ss << "sigmoid=" << sigmoid_ << std::endl;
ss << "feature_names=" << Common::Join(feature_names_, " ") << std::endl;
ss << "feature_infos=" << Common::Join(feature_infos_, " ") << std::endl;
ss << std::endl;
int num_used_model = static_cast<int>(models_.size());
if (num_iterations > 0) {
num_used_model = std::min(num_iterations * num_class_, num_used_model);
}
// output tree models
for (int i = 0; i < num_used_model; ++i) {
ss << "Tree=" << i << std::endl;
ss << models_[i]->ToString() << std::endl;
}
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
ss << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return ss.str();
}
bool GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
output_file.open(filename);
// output model type
output_file << SubModelName() << std::endl;
// output number of class
output_file << "num_class=" << num_class_ << std::endl;
// output label index
output_file << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
output_file << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name
if (object_function_ != nullptr) {
output_file << "objective=" << object_function_->GetName() << std::endl;
}
// output sigmoid parameter
output_file << "sigmoid=" << sigmoid_ << std::endl;
// output feature names
auto feature_names = std::ref(feature_names_);
if (train_data_ != nullptr) {
feature_names = std::ref(train_data_->feature_names());
}
output_file << "feature_names=" << Common::Join(feature_names.get(), " ") << std::endl;
output_file << std::endl;
int num_used_model = static_cast<int>(models_.size());
if (num_iteration > 0) {
num_used_model = std::min(num_iteration * num_class_, num_used_model);
}
// output tree models
for (int i = 0; i < num_used_model; ++i) {
output_file << "Tree=" << i << std::endl;
output_file << models_[i]->ToString() << std::endl;
}
output_file << SaveModelToString(num_iteration);
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
output_file.close();
return (bool)output_file;
}
void GBDT::LoadModelFromString(const std::string& model_str) {
bool GBDT::LoadModelFromString(const std::string& model_str) {
// use serialized string to restore this object
models_.clear();
std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
......@@ -558,7 +688,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &num_class_);
} else {
Log::Fatal("Model file doesn't specify the number of classes");
return;
return false;
}
// get index of label
line = Common::FindFromLines(lines, "label_index=");
......@@ -566,7 +696,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &label_idx_);
} else {
Log::Fatal("Model file doesn't specify the label index");
return;
return false;
}
// get max_feature_idx first
line = Common::FindFromLines(lines, "max_feature_idx=");
......@@ -574,7 +704,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common::Atoi(Common::Split(line.c_str(), '=')[1].c_str(), &max_feature_idx_);
} else {
Log::Fatal("Model file doesn't specify max_feature_idx");
return;
return false;
}
// get sigmoid parameter
line = Common::FindFromLines(lines, "sigmoid=");
......@@ -589,11 +719,24 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
feature_names_ = Common::Split(line.substr(std::strlen("feature_names=")).c_str(), " ");
if (feature_names_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
Log::Fatal("Wrong size of feature_names");
return;
return false;
}
} else {
}
else {
Log::Fatal("Model file doesn't contain feature names");
return;
return false;
}
line = Common::FindFromLines(lines, "feature_infos=");
if (line.size() > 0) {
feature_infos_ = Common::Split(line.substr(std::strlen("feature_infos=")).c_str(), " ");
if (feature_infos_.size() != static_cast<size_t>(max_feature_idx_ + 1)) {
Log::Fatal("Wrong size of feature_infos");
return false;
}
} else {
Log::Fatal("Model file doesn't contain feature infos");
return false;
}
// get tree models
......@@ -616,24 +759,23 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
num_init_iteration_ = num_iteration_for_pred_;
iter_ = 0;
return true;
}
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
auto feature_names = std::ref(feature_names_);
if (train_data_ != nullptr) {
feature_names = std::ref(train_data_->feature_names());
}
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
++feature_importances[models_[iter]->split_feature_real(split_idx)];
++feature_importances[models_[iter]->split_feature(split_idx)];
}
}
// store the importance first
std::vector<std::pair<size_t, std::string>> pairs;
for (size_t i = 0; i < feature_importances.size(); ++i) {
if (feature_importances[i] > 0) {
pairs.emplace_back(feature_importances[i], feature_names.get().at(i));
pairs.emplace_back(feature_importances[i], feature_names_[i]);
}
}
// sort the importance
......@@ -664,7 +806,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
}
// if need sigmoid transform
if (sigmoid_ > 0 && num_class_ == 1) {
ret[0] = 1.0f / (1.0f + std::exp(- 2.0f * sigmoid_ * ret[0]));
ret[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * ret[0]));
} else if (num_class_ > 1) {
Common::Softmax(&ret);
}
......
......@@ -119,7 +119,7 @@ public:
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result Used to store the prediction result; memory should be allocated before calling this function
* \param out_len lenght of returned score
* \param out_len length of returned score
*/
void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
......@@ -156,12 +156,19 @@ public:
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override ;
virtual bool SaveModelToFile(int num_iterations, const char* filename) const override ;
/*!
* \brief Save model to string
* \param num_used_model Number of models to save, -1 means save all
* \return Non-empty string if succeeded
*/
virtual std::string SaveModelToString(int num_iterations) const override ;
/*!
* \brief Restore from a serialized string
*/
void LoadModelFromString(const std::string& model_str) override;
bool LoadModelFromString(const std::string& model_str) override;
/*!
* \brief Get max feature index of this model
......@@ -169,6 +176,12 @@ public:
*/
inline int MaxFeatureIdx() const override { return max_feature_idx_; }
/*!
* \brief Get feature names of this model
* \return Feature names of this model
*/
inline std::vector<std::string> FeatureNames() const override { return feature_names_; }
/*!
* \brief Get index of label column
* \return index of label column
......@@ -228,7 +241,7 @@ protected:
* \param buffer output buffer
* \return count of left size
*/
virtual data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer);
data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer);
/*!
* \brief Updating score for out-of-bag data.
* Data should be updated since we may re-bag the data during training
......@@ -301,8 +314,6 @@ protected:
data_size_t num_data_;
/*! \brief Number of classes */
int num_class_;
/*! \brief Random generator, used for bagging */
std::vector<Random> random_;
/*!
* \brief Sigmoid parameter, used for prediction.
* if > 0 means output score will transform by sigmoid function
......@@ -318,6 +329,7 @@ protected:
int num_init_iteration_;
/*! \brief Feature names */
std::vector<std::string> feature_names_;
std::vector<std::string> feature_infos_;
/*! \brief number of threads */
int num_threads_;
/*! \brief Buffer for multi-threading bagging */
......@@ -330,6 +342,9 @@ protected:
std::vector<data_size_t> left_write_pos_buf_;
/*! \brief Buffer for multi-threading bagging */
std::vector<data_size_t> right_write_pos_buf_;
std::unique_ptr<Dataset> tmp_subset_;
bool is_use_subset_;
std::vector<bool> is_class_end_;
};
} // namespace LightGBM
......
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
#endif
class GOSS: public GBDT {
public:
/*!
* \brief Constructor
*/
GOSS() : GBDT() {
}
~GOSS() {
#ifdef TIMETAG
Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
#endif
}
void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::Init(config, train_data, object_function, training_metrics);
CHECK(gbdt_config_->top_rate + gbdt_config_->other_rate <= 1.0f);
CHECK(gbdt_config_->top_rate > 0.0f && gbdt_config_->other_rate > 0.0f);
if (gbdt_config_->bagging_freq > 0 && gbdt_config_->bagging_fraction != 1.0f) {
Log::Fatal("cannot use bagging in GOSS");
}
Log::Info("using GOSS");
}
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
if (config->bagging_freq > 0 && config->bagging_fraction != 1.0f) {
Log::Fatal("cannot use bagging in GOSS");
}
GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
if (train_data_ == nullptr) { return; }
bag_data_indices_.resize(num_data_);
tmp_indices_.resize(num_data_);
tmp_indice_right_.resize(num_data_);
offsets_buf_.resize(num_threads_);
left_cnts_buf_.resize(num_threads_);
right_cnts_buf_.resize(num_threads_);
left_write_pos_buf_.resize(num_threads_);
right_write_pos_buf_.resize(num_threads_);
is_use_subset_ = false;
if (config->top_rate + config->other_rate <= 0.5) {
auto bag_data_cnt = static_cast<data_size_t>((config->top_rate + config->other_rate) * num_data_);
tmp_subset_.reset(new Dataset(bag_data_cnt));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
is_use_subset_ = true;
}
// flag to skip bagging at first
bag_data_cnt_ = num_data_;
}
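// Descriptive note (editorial): the GOSS sampler below ranks rows by
// |gradient * hessian|, always keeps the top `top_rate` fraction,
// randomly keeps `other_rate` of the remainder, and rescales the kept
// small-gradient rows' gradients and hessians by
// (cnt - top_k) / other_k so the gradient sums stay approximately
// unbiased.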
data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
std::vector<score_t> tmp_gradients(cnt, 0.0f);
for (data_size_t i = 0; i < cnt; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int idx = curr_class * num_data_ + start + i;
tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
}
}
data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
top_k = std::max(1, top_k);
ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
score_t threshold = tmp_gradients[top_k - 1];
score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
data_size_t cur_left_cnt = 0;
data_size_t cur_right_cnt = 0;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
score_t grad = 0.0f;
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int idx = curr_class * num_data_ + start + i;
grad += std::fabs(gradients_[idx] * hessians_[idx]);
}
if (grad >= threshold) {
buffer[cur_left_cnt++] = start + i;
++big_weight_cnt;
} else {
data_size_t sampled = cur_left_cnt - big_weight_cnt;
data_size_t rest_need = other_k - sampled;
data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
double prob = (rest_need) / static_cast<double>(rest_all);
if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i;
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int idx = curr_class * num_data_ + start + i;
gradients_[idx] *= multiply;
hessians_[idx] *= multiply;
}
} else {
buffer_right[cur_right_cnt++] = start + i;
}
}
}
return cur_left_cnt;
}
void Bagging(int iter) override {
bag_data_cnt_ = num_data_;
// do not subsample for the first iterations
if (iter < static_cast<int>(1.0f / gbdt_config_->learning_rate)) { return; }
const data_size_t min_inner_size = 100;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
}
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indice_right_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
#ifdef TIMETAG
subset_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
tree_learner_->ResetTrainingData(tmp_subset_.get());
#ifdef TIMETAG
re_init_tree_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
/*!
* \brief Get Type name of this boosting object
*/
const char* SubModelName() const override { return "tree"; }
private:
std::vector<data_size_t> tmp_indice_right_;
};
} // namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
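// Usage note (editorial, parameter names assumed from the config members
// used above): the GOSS booster is selected via the boosting type string
// "goss" (see CreateBoosting) and tuned through top_rate / other_rate;
// bagging_fraction / bagging_freq must stay at their defaults, since
// GOSS replaces row bagging entirely.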
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
......
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
......@@ -31,8 +31,12 @@ public:
boosting_.reset(Boosting::CreateBoosting(filename));
}
Booster() {
boosting_.reset(Boosting::CreateBoosting("gbdt", nullptr));
}
Booster(const Dataset* train_data,
const char* parameters) {
const char* parameters) {
auto param = ConfigBase::Str2Map(parameters);
config_.Set(param);
if (config_.num_threads > 0) {
......@@ -48,7 +52,7 @@ public:
// initialize the boosting
boosting_->Init(&config_.boosting_config, nullptr, objective_fun_.get(),
Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
ResetTrainingData(train_data);
}
......@@ -67,7 +71,7 @@ public:
train_data_ = train_data;
// create objective function
objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
config_.objective_config));
config_.objective_config));
if (objective_fun_ == nullptr) {
Log::Warning("Using self-defined objective function");
}
......@@ -88,7 +92,7 @@ public:
train_metric_.shrink_to_fit();
// reset the boosting
boosting_->ResetTrainingData(&config_.boosting_config, train_data_,
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
}
void ResetConfig(const char* parameters) {
......@@ -112,7 +116,7 @@ public:
if (param.count("objective")) {
// create objective function
objective_fun_.reset(ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
config_.objective_config));
config_.objective_config));
if (objective_fun_ == nullptr) {
Log::Warning("Using self-defined objective function");
}
......@@ -123,7 +127,7 @@ public:
}
boosting_->ResetTrainingData(&config_.boosting_config, train_data_,
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
}
......@@ -138,7 +142,7 @@ public:
}
valid_metrics_.back().shrink_to_fit();
boosting_->AddValidDataset(valid_data,
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_.back()));
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_.back()));
}
bool TrainOneIter() {
......@@ -181,6 +185,14 @@ public:
boosting_->SaveModelToFile(num_iteration, filename);
}
void LoadModelFromString(const char* model_str) {
boosting_->LoadModelFromString(model_str);
}
std::string SaveModelToString(int num_iteration) {
return boosting_->SaveModelToString(num_iteration);
}
std::string DumpModel(int num_iteration) {
return boosting_->DumpModel(num_iteration);
}
......@@ -213,6 +225,15 @@ public:
return idx;
}
int GetFeatureNames(char** out_strs) const {
int idx = 0;
for (const auto& name : boosting_->FeatureNames()) {
std::strcpy(out_strs[idx], name.c_str());
++idx;
}
return idx;
}
const Boosting* GetBoosting() const { return boosting_.get(); }
private:
......@@ -245,13 +266,13 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
const void* data, int data_type, int64_t nindptr, int64_t nelem);
// Row iterator of one column for CSC matrix
class CSC_RowIterator {
public:
CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx);
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx);
~CSC_RowIterator() {}
// return value at idx; can only be accessed in ascending order
double Get(int idx);
......@@ -267,14 +288,14 @@ private:
// start of c_api functions
DllExport const char* LGBM_GetLastError() {
LIGHTGBM_C_EXPORT const char* LGBM_GetLastError() {
return LastErrorMsg();
}
DllExport int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
......@@ -284,19 +305,157 @@ DllExport int LGBM_DatasetCreateFromFile(const char* filename,
*out = loader.LoadFromFile(filename);
} else {
*out = loader.LoadFromFileAlignWithOtherDataset(filename,
reinterpret_cast<const Dataset*>(reference));
reinterpret_cast<const Dataset*>(reference));
}
API_END();
}
DllExport int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledMat(const void* data,
int data_type,
int32_t num_sample_row,
int32_t ncol,
int32_t num_total_row,
const char* parameters,
DatasetHandle* out) {
if (num_sample_row == num_total_row) {
return LGBM_DatasetCreateFromMat(data, data_type, num_total_row, ncol, 1, parameters, nullptr, out);
} else {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
io_config.Set(param);
auto get_row_fun = RowFunctionFromDenseMatric(data, num_sample_row, ncol, data_type, 1);
std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol);
for (int i = 0; i < num_sample_row; ++i) {
auto row = get_row_fun(i);
for (size_t idx = 0; idx < row.size(); ++idx) {
if (std::fabs(row[idx]) > kEpsilon) {
sample_values[idx].emplace_back(row[idx]);
sample_idx[idx].emplace_back(i);
}
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
*out = loader.CostructFromSampleData(sample_values, sample_idx,
num_sample_row,
static_cast<data_size_t>(num_total_row));
API_END();
}
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t n_sample_elem,
int64_t num_col,
int64_t num_total_row,
const char* parameters,
DatasetHandle* out) {
if (nindptr - 1 == num_total_row) {
return LGBM_DatasetCreateFromCSR(indptr, indptr_type, indices, data,
data_type, nindptr, n_sample_elem, num_col, parameters, nullptr, out);
} else {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
io_config.Set(param);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, n_sample_elem);
int32_t num_sample_row = static_cast<int32_t>(nindptr - 1);
std::vector<std::vector<double>> sample_values(num_col);
std::vector<std::vector<int>> sample_idx(num_col);
for (int i = 0; i < num_sample_row; ++i) {
auto row = get_row_fun(i);
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
sample_values.resize(inner_data.first + 1);
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(i);
}
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
*out = loader.CostructFromSampleData(sample_values, sample_idx,
num_sample_row,
static_cast<data_size_t>(num_total_row));
API_END();
}
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateByReference(const DatasetHandle reference,
int64_t num_total_row,
DatasetHandle* out) {
API_BEGIN();
std::unique_ptr<Dataset> ret;
ret.reset(new Dataset(static_cast<data_size_t>(num_total_row)));
ret->CreateValid(reinterpret_cast<const Dataset*>(reference));
*out = ret.release();
API_END();
}
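// Appends a dense block of rows starting at start_row; loading is finalized
// automatically once the last expected row has arrived.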
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset,
const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
int32_t start_row) {
API_BEGIN();
auto p_dataset = reinterpret_cast<Dataset*>(dataset);
auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
p_dataset->PushOneRow(tid, start_row + i, one_row);
}
if (start_row + nrow == p_dataset->num_data()) {
p_dataset->FinishLoad();
}
API_END();
}
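// Same chunked loading as LGBM_DatasetPushRows, with the block supplied in CSR form.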
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t nelem,
int64_t,
int64_t start_row) {
API_BEGIN();
auto p_dataset = reinterpret_cast<Dataset*>(dataset);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
int32_t nrow = static_cast<int32_t>(nindptr - 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
p_dataset->PushOneRow(tid,
static_cast<data_size_t>(start_row + i), one_row);
}
if (start_row + nrow == static_cast<int64_t>(p_dataset->num_data())) {
p_dataset->FinishLoad();
}
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
const DatasetHandle reference,
DatasetHandle* out) {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
@@ -309,25 +468,26 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol);
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) {
      if (std::fabs(row[j]) > kEpsilon) {
        sample_values[j].emplace_back(row[j]);
        sample_idx[j].emplace_back(static_cast<int>(i));
}
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
    ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
    ret->CreateValid(
      reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
@@ -338,17 +498,17 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
                                                int indptr_type,
                                                const int32_t* indices,
                                                const void* data,
                                                int data_type,
                                                int64_t nindptr,
                                                int64_t nelem,
                                                int64_t num_col,
                                                const char* parameters,
                                                const DatasetHandle reference,
                                                DatasetHandle* out) {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
@@ -362,34 +522,31 @@ DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_idx;
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
          sample_values.resize(inner_data.first + 1);
          sample_idx.resize(inner_data.first + 1);
        }
        if (std::fabs(inner_data.second) > kEpsilon) {
          sample_values[inner_data.first].emplace_back(inner_data.second);
          sample_idx[inner_data.first].emplace_back(static_cast<int>(i));
}
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
    ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
    ret->CreateValid(
      reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < nindptr - 1; ++i) {
const int tid = omp_get_thread_num();
auto one_row = get_row_fun(i);
@@ -400,17 +557,17 @@ DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
                                                int col_ptr_type,
                                                const int32_t* indices,
                                                const void* data,
                                                int data_type,
                                                int64_t ncol_ptr,
                                                int64_t nelem,
                                                int64_t num_row,
                                                const char* parameters,
                                                const DatasetHandle reference,
                                                DatasetHandle* out) {
API_BEGIN();
auto param = ConfigBase::Str2Map(parameters);
IOConfig io_config;
@@ -423,30 +580,33 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
    std::vector<std::vector<int>> sample_idx(ncol_ptr - 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
for (int j = 0; j < sample_cnt; j++) {
auto val = col_it.Get(sample_indices[j]);
if (std::fabs(val) > kEpsilon) {
          sample_values[i].emplace_back(val);
sample_idx[i].emplace_back(j);
}
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
    ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
    ret->CreateValid(
      reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num();
    int feature_idx = ret->InnerFeatureIndex(i);
if (feature_idx < 0) { continue; }
int group = ret->Feature2Group(feature_idx);
int sub_feature = ret->Feture2SubFeature(feature_idx);
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
int row_idx = 0;
while (row_idx < nrow) {
@@ -454,7 +614,7 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx = pair.first;
// no more data
if (row_idx < 0) { break; }
      ret->PushOneData(tid, row_idx, group, sub_feature, pair.second);
}
}
ret->FinishLoad();
@@ -462,7 +622,7 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetGetSubset(
const DatasetHandle handle,
const int32_t* used_row_indices,
int32_t num_used_row_indices,
@@ -473,16 +633,14 @@ DllExport int LGBM_DatasetGetSubset(
IOConfig io_config;
io_config.Set(param);
auto full_dataset = reinterpret_cast<const Dataset*>(handle);
  auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
  ret->CopyFeatureMapperFrom(full_dataset);
  ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
*out = ret.release();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetSetFeatureNames(
DatasetHandle handle,
const char** feature_names,
int num_feature_names) {
@@ -496,7 +654,7 @@ DllExport int LGBM_DatasetSetFeatureNames(
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNames(
DatasetHandle handle,
char** feature_names,
int* num_feature_names) {
@@ -510,25 +668,25 @@ DllExport int LGBM_DatasetGetFeatureNames(
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle) {
API_BEGIN();
delete reinterpret_cast<Dataset*>(handle);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle,
                                             const char* filename) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
dataset->SaveBinaryFile(filename);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
                                           const char* field_name,
                                           const void* field_data,
                                           int num_element,
                                           int type) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
bool is_success = false;
@@ -543,11 +701,11 @@ DllExport int LGBM_DatasetSetField(DatasetHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetGetField(DatasetHandle handle,
                                           const char* field_name,
                                           int* out_len,
                                           const void** out_ptr,
                                           int* out_type) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
bool is_success = false;
@@ -566,16 +724,16 @@ DllExport int LGBM_DatasetGetField(DatasetHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
                                             int* out) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
*out = dataset->num_data();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle,
                                                int* out) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
*out = dataset->num_total_features();
@@ -584,9 +742,9 @@ DllExport int LGBM_DatasetGetNumFeature(DatasetHandle handle,
// ---- start of booster
LIGHTGBM_C_EXPORT int LGBM_BoosterCreate(const DatasetHandle train_data,
                                         const char* parameters,
                                         BoosterHandle* out) {
API_BEGIN();
const Dataset* p_train_data = reinterpret_cast<const Dataset*>(train_data);
auto ret = std::unique_ptr<Booster>(new Booster(p_train_data, parameters));
@@ -594,7 +752,7 @@ DllExport int LGBM_BoosterCreate(const DatasetHandle train_data,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterCreateFromModelfile(
const char* filename,
int* out_num_iterations,
BoosterHandle* out) {
@@ -605,14 +763,26 @@ DllExport int LGBM_BoosterCreateFromModelfile(
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterLoadModelFromString(
const char* model_str,
int* out_num_iterations,
BoosterHandle* out) {
API_BEGIN();
auto ret = std::unique_ptr<Booster>(new Booster());
ret->LoadModelFromString(model_str);
*out_num_iterations = ret->GetBoosting()->GetCurrentIteration();
*out = ret.release();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterFree(BoosterHandle handle) {
API_BEGIN();
delete reinterpret_cast<Booster*>(handle);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterMerge(BoosterHandle handle,
                                        BoosterHandle other_handle) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
Booster* ref_other_booster = reinterpret_cast<Booster*>(other_handle);
@@ -620,8 +790,8 @@ DllExport int LGBM_BoosterMerge(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterAddValidData(BoosterHandle handle,
                                               const DatasetHandle valid_data) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
const Dataset* p_dataset = reinterpret_cast<const Dataset*>(valid_data);
@@ -629,8 +799,8 @@ DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterResetTrainingData(BoosterHandle handle,
                                                    const DatasetHandle train_data) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
const Dataset* p_dataset = reinterpret_cast<const Dataset*>(train_data);
@@ -638,21 +808,21 @@ DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterResetParameter(BoosterHandle handle, const char* parameters) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->ResetConfig(parameters);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetNumClasses(BoosterHandle handle, int* out_len) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetBoosting()->NumberOfClasses();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
if (ref_booster->TrainOneIter()) {
@@ -663,10 +833,10 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished)
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
                                                      const float* grad,
                                                      const float* hess,
                                                      int* is_finished) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
if (ref_booster->TrainOneIter(grad, hess)) {
@@ -677,38 +847,52 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterRollbackOneIter(BoosterHandle handle) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->RollbackOneIter();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int* out_iteration) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_iteration = ref_booster->GetBoosting()->GetCurrentIteration();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int* out_len) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetEvalCounts();
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetEvalNames(BoosterHandle handle, int* out_len, char** out_strs) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetEvalNames(out_strs);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetFeatureNames(BoosterHandle handle, int* out_len, char** out_strs) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetFeatureNames(out_strs);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetNumFeature(BoosterHandle handle, int* out_len) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetBoosting()->MaxFeatureIdx() + 1;
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx,
int* out_len,
double* out_results) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto boosting = ref_booster->GetBoosting();
@@ -720,31 +904,31 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetNumPredict(BoosterHandle handle,
                                                int data_idx,
                                                int64_t* out_len) {
API_BEGIN();
auto boosting = reinterpret_cast<Booster*>(handle)->GetBoosting();
*out_len = boosting->GetNumPredictAt(data_idx);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetPredict(BoosterHandle handle,
                                             int data_idx,
                                             int64_t* out_len,
                                             double* out_result) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->GetPredictAt(data_idx, out_result, out_len);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
                                                 const char* data_filename,
                                                 int data_has_header,
                                                 int predict_type,
                                                 int num_iteration,
                                                 const char* result_filename) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
@@ -766,37 +950,37 @@ int64_t GetNumPredOneRow(const Booster* ref_booster, int predict_type, int64_t n
return num_preb_in_one_row;
}
LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                                 int num_row,
                                                 int predict_type,
                                                 int num_iteration,
                                                 int64_t* out_len) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = static_cast<int64_t>(num_row * GetNumPredOneRow(ref_booster, predict_type, num_iteration));
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                                                const void* indptr,
                                                int indptr_type,
                                                const int32_t* indices,
                                                const void* data,
                                                int data_type,
                                                int64_t nindptr,
                                                int64_t nelem,
                                                int64_t,
                                                int predict_type,
                                                int num_iteration,
                                                int64_t* out_len,
                                                double* out_result) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
int64_t num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i);
auto predicton_result = predictor.GetPredictFunction()(one_row);
@@ -808,19 +992,19 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                                                const void* col_ptr,
                                                int col_ptr_type,
                                                const int32_t* indices,
                                                const void* data,
                                                int data_type,
                                                int64_t ncol_ptr,
                                                int64_t nelem,
                                                int64_t num_row,
                                                int predict_type,
                                                int num_iteration,
                                                int64_t* out_len,
                                                double* out_result) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
@@ -828,7 +1012,7 @@ DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
int ncol = static_cast<int>(ncol_ptr - 1);
Threading::For<int64_t>(0, num_row,
    [&predictor, &out_result, num_preb_in_one_row, ncol, col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem]
(int, data_size_t start, data_size_t end) {
std::vector<CSC_RowIterator> iterators;
for (int j = 0; j < ncol; ++j) {
@@ -853,22 +1037,22 @@ DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
                                                const void* data,
                                                int data_type,
                                                int32_t nrow,
                                                int32_t ncol,
                                                int is_row_major,
                                                int predict_type,
                                                int num_iteration,
                                                int64_t* out_len,
                                                double* out_result) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
int64_t num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i);
auto predicton_result = predictor.GetPredictFunction()(one_row);
@@ -880,20 +1064,35 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModel(BoosterHandle handle,
                                            int num_iteration,
                                            const char* filename) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->SaveModelToFile(num_iteration, filename);
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModelToString(BoosterHandle handle,
                                                    int num_iteration,
                                                    int buffer_len,
                                                    int* out_len,
                                                    char* out_str) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
std::string model = ref_booster->SaveModelToString(num_iteration);
*out_len = static_cast<int>(model.size()) + 1;
if (*out_len <= buffer_len) {
std::strcpy(out_str, model.c_str());
}
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterDumpModel(BoosterHandle handle,
int num_iteration,
int buffer_len,
int* out_len,
char* out_str) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
std::string model = ref_booster->DumpModel(num_iteration);
@@ -904,26 +1103,74 @@ DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterGetLeafValue(BoosterHandle handle,
                                               int tree_idx,
                                               int leaf_idx,
                                               double* out_val) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_val = static_cast<double>(ref_booster->GetLeafValue(tree_idx, leaf_idx));
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle,
                                               int tree_idx,
                                               int leaf_idx,
                                               double val) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->SetLeafValue(tree_idx, leaf_idx, val);
API_END();
}
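// The following helpers let callers (e.g. language bindings) allocate, fill and
// free raw typed arrays through the C API without knowing the element type.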
LIGHTGBM_C_EXPORT int LGBM_AllocateArray(int64_t len, int type, ArrayHandle* out) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
*out = new float[len];
} else if (type == C_API_DTYPE_FLOAT64) {
*out = new double[len];
} else if (type == C_API_DTYPE_INT32) {
*out = new int32_t[len];
} else if (type == C_API_DTYPE_INT64) {
*out = new int64_t[len];
}
API_END();
}
template<typename T>
void Copy(T* dst, const T* src, int64_t len) {
std::memcpy(dst, src, sizeof(T) * len);
}
LIGHTGBM_C_EXPORT int LGBM_CopyToArray(ArrayHandle arr, int type, int64_t start_idx, const void* src, int64_t len) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
Copy<float>(static_cast<float*>(arr) + start_idx, static_cast<const float*>(src), len);
} else if (type == C_API_DTYPE_FLOAT64) {
Copy<double>(static_cast<double*>(arr) + start_idx, static_cast<const double*>(src), len);
} else if (type == C_API_DTYPE_INT32) {
Copy<int32_t>(static_cast<int32_t*>(arr) + start_idx, static_cast<const int32_t*>(src), len);
} else if (type == C_API_DTYPE_INT64) {
Copy<int64_t>(static_cast<int64_t*>(arr) + start_idx, static_cast<const int64_t*>(src), len);
}
API_END();
}
LIGHTGBM_C_EXPORT int LGBM_FreeArray(ArrayHandle arr, int type) {
API_BEGIN();
if (type == C_API_DTYPE_FLOAT32) {
delete[] static_cast<float*>(arr);
} else if (type == C_API_DTYPE_FLOAT64) {
delete[] static_cast<double*>(arr);
} else if (type == C_API_DTYPE_INT32) {
delete[] static_cast<int32_t*>(arr);
} else if (type == C_API_DTYPE_INT64) {
delete[] static_cast<int64_t*>(arr);
}
API_END();
}
// ---- start of some help functions
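// Each factory below wraps one input layout (dense matrix, CSR, CSC) in a closure
// that yields a single row (or column iterator) in a uniform representation, so
// the loading and prediction code paths stay independent of the raw data format.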
std::function<std::vector<double>(int row_idx)>
@@ -931,7 +1178,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
if (data_type == C_API_DTYPE_FLOAT32) {
const float* data_ptr = reinterpret_cast<const float*>(data);
if (is_row_major) {
      return [data_ptr, num_col, num_row] (int row_idx) {
std::vector<double> ret(num_col);
auto tmp_ptr = data_ptr + num_col * row_idx;
for (int i = 0; i < num_col; ++i) {
@@ -940,7 +1187,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
return ret;
};
} else {
      return [data_ptr, num_col, num_row] (int row_idx) {
std::vector<double> ret(num_col);
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(data_ptr + num_row * i + row_idx));
@@ -951,7 +1198,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
} else if (data_type == C_API_DTYPE_FLOAT64) {
const double* data_ptr = reinterpret_cast<const double*>(data);
if (is_row_major) {
      return [data_ptr, num_col, num_row] (int row_idx) {
std::vector<double> ret(num_col);
auto tmp_ptr = data_ptr + num_col * row_idx;
for (int i = 0; i < num_col; ++i) {
@@ -960,7 +1207,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
return ret;
};
} else {
      return [data_ptr, num_col, num_row] (int row_idx) {
std::vector<double> ret(num_col);
for (int i = 0; i < num_col; ++i) {
ret[i] = static_cast<double>(*(data_ptr + num_row * i + row_idx));
@@ -976,7 +1223,7 @@ std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
auto inner_function = RowFunctionFromDenseMatric(data, num_row, num_col, data_type, is_row_major);
if (inner_function != nullptr) {
    return [inner_function] (int row_idx) {
auto raw_values = inner_function(row_idx);
std::vector<std::pair<int, double>> ret;
for (int i = 0; i < static_cast<int>(raw_values.size()); ++i) {
@@ -996,7 +1243,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const float* data_ptr = reinterpret_cast<const float*>(data);
if (indptr_type == C_API_DTYPE_INT32) {
const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem] (int idx) {
std::vector<std::pair<int, double>> ret;
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
@@ -1007,7 +1254,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
};
} else if (indptr_type == C_API_DTYPE_INT64) {
const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem] (int idx) {
std::vector<std::pair<int, double>> ret;
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
@@ -1021,7 +1268,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const double* data_ptr = reinterpret_cast<const double*>(data);
if (indptr_type == C_API_DTYPE_INT32) {
const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem] (int idx) {
std::vector<std::pair<int, double>> ret;
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
@@ -1032,7 +1279,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
};
} else if (indptr_type == C_API_DTYPE_INT64) {
const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem] (int idx) {
std::vector<std::pair<int, double>> ret;
int64_t start = ptr_indptr[idx];
int64_t end = ptr_indptr[idx + 1];
@@ -1055,7 +1302,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
int64_t start = ptr_col_ptr[col_idx];
int64_t end = ptr_col_ptr[col_idx + 1];
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end] (int bias) {
int64_t i = static_cast<int64_t>(start + bias);
if (i >= end) {
return std::make_pair(-1, 0.0);
@@ -1068,7 +1315,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
int64_t start = ptr_col_ptr[col_idx];
int64_t end = ptr_col_ptr[col_idx + 1];
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end] (int bias) {
int64_t i = static_cast<int64_t>(start + bias);
if (i >= end) {
return std::make_pair(-1, 0.0);
@@ -1084,7 +1331,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
int64_t start = ptr_col_ptr[col_idx];
int64_t end = ptr_col_ptr[col_idx + 1];
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end] (int bias) {
int64_t i = static_cast<int64_t>(start + bias);
if (i >= end) {
return std::make_pair(-1, 0.0);
@@ -1097,7 +1344,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
int64_t start = ptr_col_ptr[col_idx];
int64_t end = ptr_col_ptr[col_idx + 1];
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end] (int bias) {
int64_t i = static_cast<int64_t>(start + bias);
if (i >= end) {
return std::make_pair(-1, 0.0);
@@ -1112,7 +1359,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
}
CSC_RowIterator::CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices,
                                 const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx) {
iter_fun_ = IterateFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, col_idx);
}
......
@@ -2,6 +2,7 @@
#include <LightGBM/bin.h>
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
#include "sparse_bin.hpp"
#include "ordered_sparse_bin.hpp"
@@ -30,7 +31,9 @@ BinMapper::BinMapper(const BinMapper& other) {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
BinMapper::BinMapper(const void* memory) {
@@ -41,37 +44,60 @@ BinMapper::~BinMapper() {
}
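// NeedFilter returns true only when no bin boundary could put at least filter_cnt
// samples on either side of a split, i.e. the feature cannot produce a valid split
// and may be treated as trivial.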
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt) {
return false;
} else if (total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt) {
return false;
} else if (total_cnt - sum_left >= filter_cnt) {
return false;
}
}
}
return true;
}
void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
bin_type_ = bin_type;
  default_bin_ = 0;
  std::vector<double>& raw_values = values;
  int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
  std::sort(raw_values.begin(), raw_values.end());
// push zero in the front
  if (raw_values.empty() || (raw_values[0] > 0.0f && zero_cnt > 0)) {
    distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
  if (!raw_values.empty()) {
    distinct_values.push_back(raw_values[0]);
counts.push_back(1);
}
  for (size_t i = 1; i < raw_values.size(); ++i) {
    if (raw_values[i] != raw_values[i - 1]) {
      if (raw_values[i - 1] < 0.0f && raw_values[i] > 0.0f) {
        distinct_values.push_back(0.0f);
        counts.push_back(zero_cnt);
      }
      distinct_values.push_back(raw_values[i]);
counts.push_back(1);
} else {
++counts.back();
@@ -79,29 +105,44 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
// push zero in the back
  if (!raw_values.empty() && raw_values.back() < 0.0f && zero_cnt > 0) {
    distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (num_values <= max_bin) {
      // use distinct value is enough
bin_upper_bound_.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
        cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cnt_in_bin.push_back(cur_cnt_inbin);
cur_cnt_inbin = 0;
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = static_cast<int>(raw_values.size());
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
}
int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
@@ -110,8 +151,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
rest_sample_cnt -= counts[i];
}
}
      mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
@@ -127,9 +167,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
          cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
@@ -140,7 +178,8 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
@@ -151,7 +190,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
CHECK(num_bin_ <= max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
@@ -169,20 +208,21 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ignore the categoricals with small counts
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
categorical_2_bin_.clear();
    bin_2_categorical_.clear();
    num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
}
    cnt_in_bin = counts_int;
    cnt_in_bin.resize(num_bin_);
    // fold the ignored categories into the count of the last kept bin
    cnt_in_bin.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
  // check trivial (num_bin_ == 1) feature
@@ -191,8 +231,16 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
} else {
is_trival_ = false;
}
// check useless bin
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trival_ = true;
}
if (!is_trival_) {
default_bin_ = ValueToBin(0);
}
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
}
@@ -202,7 +250,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t);
return size;
}
@@ -215,6 +265,12 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
  std::memcpy(buffer, &min_val_, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(buffer, &max_val_, sizeof(max_val_));
  buffer += sizeof(max_val_);
  std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
@@ -231,6 +287,12 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
@@ -249,6 +311,9 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file);
fwrite(&default_bin_, sizeof(default_bin_), 1, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
@@ -258,7 +323,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
@@ -271,73 +336,47 @@ template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
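// A feature is stored sparsely when its default bin covers at least this
// fraction of the rows (and sparse storage is enabled).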
double BinMapper::kSparseThreshold = 0.8f;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
                    bool is_enable_sparse, bool* is_sparse) {
  if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) {
*is_sparse = true;
    return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
    return CreateDenseBin(num_data, num_bin);
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new Dense4bitsBin(num_data);
} else if (num_bin <= 256) {
return new DenseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t>(num_data);
} else {
    return new DenseBin<uint32_t>(num_data);
}
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new SparseBin<uint16_t>(num_data);
} else {
    return new SparseBin<uint32_t>(num_data);
}
}
......
@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed.
if (GetInt(params, "seed", &seed)) {
Random rand(seed);
    int int_max = std::numeric_limits<short>::max();
    io_config.data_random_seed = static_cast<int>(rand.NextShort(0, int_max));
    boosting_config.bagging_seed = static_cast<int>(rand.NextShort(0, int_max));
    boosting_config.drop_seed = static_cast<int>(rand.NextShort(0, int_max));
    boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextShort(0, int_max));
}
GetTaskType(params);
GetBoostingType(params);
@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type = "gbdt";
} else if (value == std::string("dart")) {
boosting_type = "dart";
} else if (value == std::string("goss")) {
boosting_type = "goss";
} else {
Log::Fatal("Unknown boosting type %s", value.c_str());
}
@@ -102,7 +104,7 @@ void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::str
std::transform(value.begin(), value.end(), value.begin(), Common::tolower);
// split
std::vector<std::string> metrics = Common::Split(value.c_str(), ',');
  // remove duplicates
std::unordered_set<std::string> metric_sets;
for (auto& metric : metrics) {
std::transform(metric.begin(), metric.end(), metric.begin(), Common::tolower);
@@ -147,11 +149,13 @@ void OverallConfig::CheckParamConflict() {
Log::Fatal("Number of classes must be 1 for non-multiclass training");
}
}
if (boosting_config.is_provide_training_metric || !io_config.valid_data_filenames.empty()) {
for (std::string metric_type : metric_types) {
bool metric_type_multiclass = (metric_type == std::string("multi_logloss") || metric_type == std::string("multi_error"));
if ((objective_type_multiclass && !metric_type_multiclass)
|| (!objective_type_multiclass && metric_type_multiclass)) {
Log::Fatal("Objective and metrics don't match");
}
}
}
@@ -177,7 +181,7 @@ void OverallConfig::CheckParamConflict() {
&& boosting_config.tree_learner_type == std::string("data")) {
Log::Warning("Histogram LRU queue was enabled (histogram_pool_size=%f). Will disable this to reduce communication costs"
, boosting_config.tree_config.histogram_pool_size);
    // Change pool size to -1 (no limit) when using data parallel to reduce communication costs
boosting_config.tree_config.histogram_pool_size = -1;
}
@@ -213,6 +217,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column);
GetString(params, "categorical_column", &categorical_column);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
GetInt(params, "min_dato_in_bin", &min_data_in_bin);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
GetBool(params, "enable_bundle", &enable_bundle);
GetBool(params, "adjacent_bundle", &adjacent_bundle);
}
@@ -222,6 +231,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
GetDouble(params, "huber_delta", &huber_delta);
GetDouble(params, "fair_c", &fair_c);
GetDouble(params, "gaussian_eta", &gaussian_eta);
GetDouble(params, "poisson_max_delta_step", &poisson_max_delta_step);
GetInt(params, "max_position", &max_position);
CHECK(max_position > 0);
GetInt(params, "num_class", &num_class);
@@ -293,7 +303,6 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetDouble(params, "histogram_pool_size", &histogram_pool_size);
GetInt(params, "max_depth", &max_depth);
GetInt(params, "top_k", &top_k);
}
@@ -320,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt(params, "max_drop", &max_drop);
GetBool(params, "xgboost_dart_mode", &xgboost_dart_mode);
GetBool(params, "uniform_drop", &uniform_drop);
GetDouble(params, "top_rate", &top_rate);
GetDouble(params, "other_rate", &other_rate);
CHECK(drop_rate <= 1.0 && drop_rate >= 0.0);
CHECK(skip_drop <= 1.0 && skip_drop >= 0.0);
GetTreeLearnerType(params);
......
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/utils/array_args.h>
#include <omp.h>
#include <chrono>
#include <cstdio>
#include <unordered_map>
#include <limits>
@@ -19,55 +20,212 @@ const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______
Dataset::Dataset() {
data_filename_ = "noname";
num_data_ = 0;
is_finish_load_ = false;
}
Dataset::Dataset(data_size_t num_data) {
data_filename_ = "noname";
num_data_ = num_data;
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
is_finish_load_ = false;
}
Dataset::~Dataset() {
}
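// Maps every used feature to its own single-feature group (no feature bundling).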
std::vector<std::vector<int>> NoGroup(
const std::vector<int>& used_features) {
std::vector<std::vector<int>> features_in_group;
features_in_group.resize(used_features.size());
for (size_t i = 0; i < used_features.size(); ++i) {
features_in_group[i].emplace_back(used_features[i]);
}
return features_in_group;
}
void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>&,
size_t,
const IOConfig& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size());
// get num_features
std::vector<int> used_features;
for (int i = 0; i < static_cast<int>(bin_mappers.size()); ++i) {
if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trival()) {
used_features.emplace_back(i);
}
}
auto features_in_group = NoGroup(used_features);
num_features_ = 0;
for (const auto& fs : features_in_group) {
num_features_ += static_cast<int>(fs.size());
}
int cur_fidx = 0;
used_feature_map_ = std::vector<int>(num_total_features_, -1);
num_groups_ = static_cast<int>(features_in_group.size());
real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_);
for (int i = 0; i < num_groups_; ++i) {
auto cur_features = features_in_group[i];
int cur_cnt_features = static_cast<int>(cur_features.size());
// get bin_mappers
std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
for (int j = 0; j < cur_cnt_features; ++j) {
int real_fidx = cur_features[j];
used_feature_map_[real_fidx] = cur_fidx;
real_feature_idx_[cur_fidx] = real_fidx;
feature2group_[cur_fidx] = i;
feature2subfeature_[cur_fidx] = j;
cur_bin_mappers.emplace_back(bin_mappers[real_fidx].release());
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, io_config.is_enable_sparse)));
}
feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear();
uint64_t num_total_bin = 0;
group_bin_boundaries_.push_back(num_total_bin);
for (int i = 0; i < num_groups_; ++i) {
num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}
int last_group = 0;
group_feature_start_.reserve(num_groups_);
group_feature_cnt_.reserve(num_groups_);
group_feature_start_.push_back(0);
group_feature_cnt_.push_back(1);
for (int i = 1; i < num_features_; ++i) {
const int group = feature2group_[i];
if (group == last_group) {
group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
} else {
group_feature_start_.push_back(i);
group_feature_cnt_.push_back(1);
last_group = group;
}
}
}
void Dataset::FinishLoad() {
if (is_finish_load_) { return; }
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_groups_; ++i) {
    feature_groups_[i]->bin_data_->FinishLoad();
}
is_finish_load_ = true;
}
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = dataset->num_groups_;
bool is_enable_sparse = false;
for (int i = 0; i < num_groups_; ++i) {
if (dataset->feature_groups_[i]->is_sparse_) {
is_enable_sparse = true;
break;
}
}
// copy feature bin mapper data
for (const auto& feature : dataset->features_) {
features_.emplace_back(std::unique_ptr<Feature>(
new Feature(feature->feature_index(),
new BinMapper(*feature->bin_mapper()),
num_data_,
is_enable_sparse)
));
}
features_.shrink_to_fit();
for (int i = 0; i < num_groups_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
for (int j = 0; j < dataset->feature_groups_[i]->num_feature_; ++j) {
bin_mappers.emplace_back(new BinMapper(*(dataset->feature_groups_[i]->bin_mappers_[j])));
}
feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_,
bin_mappers,
num_data_,
is_enable_sparse));
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
num_features_ = static_cast<int>(features_.size());
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
real_feature_idx_ = dataset->real_feature_idx_;
feature2group_ = dataset->feature2group_;
feature2subfeature_ = dataset->feature2subfeature_;
group_bin_boundaries_ = dataset->group_bin_boundaries_;
group_feature_start_ = dataset->group_feature_start_;
group_feature_cnt_ = dataset->group_feature_cnt_;
}
Dataset* Dataset::Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const {
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_indices));
ret->CopyFeatureMapperFrom(this, is_enable_sparse);
#pragma omp parallel for schedule(guided)
for (int fidx = 0; fidx < num_features_; ++fidx) {
auto iterator = features_[fidx]->bin_data()->GetIterator(0);
for (data_size_t i = 0; i < num_used_indices; ++i) {
ret->features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
void Dataset::CreateValid(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = num_features_;
bool is_enable_sparse = true;
feature2group_.clear();
feature2subfeature_.clear();
// copy feature bin mapper data
for (int i = 0; i < num_features_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
feature_groups_.emplace_back(new FeatureGroup(
1,
bin_mappers,
num_data_,
is_enable_sparse));
feature2group_.push_back(i);
feature2subfeature_.push_back(0);
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
real_feature_idx_ = dataset->real_feature_idx_;
group_bin_boundaries_.clear();
uint64_t num_total_bin = 0;
group_bin_boundaries_.push_back(num_total_bin);
for (int i = 0; i < num_groups_; ++i) {
num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}
int last_group = 0;
group_feature_start_.reserve(num_groups_);
group_feature_cnt_.reserve(num_groups_);
group_feature_start_.push_back(0);
group_feature_cnt_.push_back(1);
for (int i = 1; i < num_features_; ++i) {
const int group = feature2group_[i];
if (group == last_group) {
group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
} else {
group_feature_start_.push_back(i);
group_feature_cnt_.push_back(1);
last_group = group;
}
}
}
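// Worked example (illustrative): CreateValid() puts every feature in its own
// group, so with num_features_ = 3, feature2group_ = {0, 1, 2} and
// feature2subfeature_ = {0, 0, 0}. If the per-feature bin counts are
// {16, 8, 32}, group_bin_boundaries_ becomes the running sum {0, 16, 24, 56}.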
void Dataset::ReSize(data_size_t num_data) {
if (num_data_ != num_data) {
num_data_ = num_data;
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
feature_groups_[group]->bin_data_->ReSize(num_data_);
}
}
ret->metadata_.Init(metadata_, used_indices, num_used_indices);
return ret.release();
}
void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
CHECK(num_used_indices == num_data_);
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), used_indices, num_used_indices);
}
if (need_meta_data) {
metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
}
is_finish_load_ = true;
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
......@@ -99,8 +257,6 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, data_si
name = Common::Trim(name);
if (name == std::string("query") || name == std::string("group")) {
metadata_.SetQuery(field_data, num_element);
} else if (name == std::string("query_id") || name == std::string("group_id")) {
metadata_.SetQueryId(field_data, num_element);
} else {
return false;
}
......@@ -147,8 +303,8 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
}
void Dataset::SaveBinaryFile(const char* bin_filename) {
if (bin_filename != nullptr
&& std::string(bin_filename) == std::string(data_filename_)) {
if (bin_filename != nullptr
&& std::string(bin_filename) == std::string(data_filename_)) {
Log::Warning("Bianry file %s already existed", bin_filename);
return;
}
......@@ -185,8 +341,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
size_t size_of_token = std::strlen(binary_file_token);
fwrite(binary_file_token, sizeof(char), size_of_token, file);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(size_t) + sizeof(int) * used_feature_map_.size();
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_;
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
......@@ -195,10 +352,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// write header
fwrite(&num_data_, sizeof(num_data_), 1, file);
fwrite(&num_features_, sizeof(num_features_), 1, file);
fwrite(&num_total_features_, sizeof(num_features_), 1, file);
size_t num_used_feature_map = used_feature_map_.size();
fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file);
fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file);
fwrite(&num_total_features_, sizeof(num_total_features_), 1, file);
fwrite(used_feature_map_.data(), sizeof(int), num_total_features_, file);
fwrite(&num_groups_, sizeof(num_groups_), 1, file);
fwrite(real_feature_idx_.data(), sizeof(int), num_features_, file);
fwrite(feature2group_.data(), sizeof(int), num_features_, file);
fwrite(feature2subfeature_.data(), sizeof(int), num_features_, file);
fwrite(group_bin_boundaries_.data(), sizeof(uint64_t), num_groups_ + 1, file);
fwrite(group_feature_start_.data(), sizeof(int), num_groups_, file);
fwrite(group_feature_cnt_.data(), sizeof(int), num_groups_, file);
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
......@@ -215,15 +377,95 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
metadata_.SaveBinaryToFile(file);
// write feature data
for (int i = 0; i < num_features_; ++i) {
for (int i = 0; i < num_groups_; ++i) {
// get size of feature
size_t size_of_feature = features_[i]->SizesInByte();
size_t size_of_feature = feature_groups_[i]->SizesInByte();
fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
// write feature
features_[i]->SaveBinaryToFile(file);
feature_groups_[i]->SaveBinaryToFile(file);
}
fclose(file);
}
}
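// --- Illustrative sketch (hypothetical helper, not in the source) ----------
// Reader and writer must agree on the header layout field by field. Assuming
// 4-byte index types, the size computed above decomposes as:
#include <cstddef>
#include <cstdint>

inline size_t BinaryHeaderSizeSketch(int num_total_features, int num_features,
                                     int num_groups) {
  size_t size = 3 * sizeof(int);                 // num_data_, num_features_, num_total_features_
  size += sizeof(int) * num_total_features;      // used_feature_map_
  size += sizeof(int);                           // num_groups_
  size += 3 * sizeof(int) * num_features;        // real_feature_idx_, feature2group_, feature2subfeature_
  size += sizeof(uint64_t) * (num_groups + 1);   // group_bin_boundaries_
  size += 2 * sizeof(int) * num_groups;          // group_feature_start_, group_feature_cnt_
  return size;  // feature-name bytes are accounted for separately above
}
// ----------------------------------------------------------------------------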
void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* hist_data) const {
if (leaf_idx < 0 || num_data <= 0 || hist_data == nullptr) {
return;
}
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
}
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
bool is_group_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_group_used = true;
break;
}
}
// skip this group if none of its features are used
if (!is_group_used) { continue; }
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if ordered bin is not used
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// use the ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
}
}
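// --- Illustrative sketch (not from the source) ------------------------------
// The core of histogram construction is a per-bin accumulation of gradient
// and hessian sums plus counts; a naive single-threaded version, assuming
// 8-bit bins and float scores:
struct HistEntrySketch { double sum_gradients; double sum_hessians; int cnt; };

inline void NaiveHistogram(const unsigned char* bins, const float* gradients,
                           const float* hessians, int num_data,
                           HistEntrySketch* out) {
  for (int i = 0; i < num_data; ++i) {
    const unsigned char b = bins[i];   // bin index of row i
    out[b].sum_gradients += gradients[i];
    out[b].sum_hessians += hessians[i];
    ++out[b].cnt;
  }
}
// The real code dispatches to per-group Bin implementations, which add
// loop unrolling and OpenMP parallelism on top of this pattern.
// ----------------------------------------------------------------------------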
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
const int default_bin = bin_mapper->GetDefaultBin();
if (default_bin > 0) {
const int num_bin = bin_mapper->num_bin();
data[default_bin].sum_gradients = sum_gradient;
data[default_bin].sum_hessians = sum_hessian;
data[default_bin].cnt = num_data;
for (int i = 0; i < num_bin; ++i) {
if (i != default_bin) {
data[default_bin].sum_gradients -= data[i].sum_gradients;
data[default_bin].sum_hessians -= data[i].sum_hessians;
data[default_bin].cnt -= data[i].cnt;
}
}
}
}
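// Worked example (illustrative): FixHistogram() recovers the default bin by
// subtraction instead of scanning sparse zeros. With a leaf total of
// sum_gradient = 10.0 over 100 rows, and non-default bins holding
// {3.0 over 20 rows} and {2.5 over 30 rows}, the default bin ends up with
// 10.0 - 3.0 - 2.5 = 4.5 over 100 - 20 - 30 = 50 rows.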
} // namespace LightGBM
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
......@@ -132,8 +131,6 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_.emplace(group_idx_);
}
}
// load categorical features
if (io_config_.categorical_column.size() > 0) {
if (Common::StartsWith(io_config_.categorical_column, name_prefix)) {
std::string names = io_config_.categorical_column.substr(name_prefix.size());
......@@ -209,7 +206,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
} else {
// load data from binary file
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines));
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
}
// check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
......@@ -238,7 +235,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
dataset->CreateValid(train_data);
// extract features
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
text_data.clear();
......@@ -249,13 +246,13 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
num_global_data = dataset->num_data_;
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
dataset->CreateValid(train_data);
// extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
} else {
// load data from binary file
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), 0, 1));
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), 0, 1, &num_global_data, &used_data_indices));
}
// no need to check validation data
// check meta data
......@@ -263,7 +260,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
return dataset.release();
}
Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines) {
Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
auto dataset = std::unique_ptr<Dataset>(new Dataset());
FILE* file;
#ifdef _MSC_VER
......@@ -318,14 +315,60 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr += sizeof(dataset->num_features_);
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_total_features_);
size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
mem_ptr += sizeof(num_used_feature_map);
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (size_t i = 0; i < num_used_feature_map; ++i) {
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->used_feature_map_.push_back(tmp_feature_map[i]);
}
mem_ptr += sizeof(int) * num_used_feature_map;
mem_ptr += sizeof(int) * dataset->num_total_features_;
// num_groups
dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_groups_);
// real_feature_idx_
const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
dataset->real_feature_idx_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// feature2group
const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2group_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// feature2subfeature
const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2subfeature_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// group_bin_boundaries
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
dataset->group_bin_boundaries_.clear();
for (int i = 0; i < dataset->num_groups_ + 1; ++i) {
dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]);
}
mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1);
// group_feature_start_
const int* tmp_ptr_group_feature_start = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_start_.clear();
for (int i = 0; i < dataset->num_groups_ ; ++i) {
dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
}
mem_ptr += sizeof(int) * (dataset->num_groups_);
// group_feature_cnt_
const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_cnt_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
}
mem_ptr += sizeof(int) * (dataset->num_groups_);
// get feature names
dataset->feature_names_.clear();
// read feature names
......@@ -364,16 +407,16 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
// load meta data
dataset->metadata_.LoadFromMemory(buffer.data());
std::vector<data_size_t> used_data_indices;
data_size_t num_global_data = dataset->num_data_;
*num_global_data = dataset->num_data_;
used_data_indices->clear();
// sample local used data if need to partition
if (num_machines > 1 && !io_config_.is_pre_partition) {
const data_size_t* query_boundaries = dataset->metadata_.query_boundaries();
if (query_boundaries == nullptr) {
// if no query file is given, the minimal sampling unit is one record
for (data_size_t i = 0; i < dataset->num_data_; ++i) {
if (random_.NextInt(0, num_machines) == rank) {
used_data_indices.push_back(i);
if (random_.NextShort(0, num_machines) == rank) {
used_data_indices->push_back(i);
}
}
} else {
......@@ -388,21 +431,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if (i >= query_boundaries[qid + 1]) {
// this is a new query
is_query_used = false;
if (random_.NextInt(0, num_machines) == rank) {
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
}
if (is_query_used) {
used_data_indices.push_back(i);
used_data_indices->push_back(i);
}
}
}
dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
dataset->num_data_ = static_cast<data_size_t>((*used_data_indices).size());
}
dataset->metadata_.PartitionLabel(used_data_indices);
dataset->metadata_.PartitionLabel(*used_data_indices);
// read feature data
for (int i = 0; i < dataset->num_features_; ++i) {
for (int i = 0; i < dataset->num_groups_; ++i) {
// read feature size
read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) {
......@@ -420,64 +463,49 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if (read_cnt != size_of_feature) {
Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
}
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(buffer.data(),
num_global_data,
used_data_indices)
dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(buffer.data(),
*num_global_data,
*used_data_indices)
));
}
dataset->features_.shrink_to_fit();
dataset->feature_groups_.shrink_to_fit();
fclose(file);
dataset->is_finish_load_ = true;
return dataset.release();
}
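// --- Illustrative sketch (hypothetical helper, not in the source) ----------
// LoadFromBinFile() repeats one deserialization pattern: reinterpret the
// current buffer position as the field type, copy out, then advance the
// pointer by the bytes consumed.
#include <cstddef>
#include <vector>

inline const char* ReadIntArray(const char* mem_ptr, int count,
                                std::vector<int>* out) {
  const int* src = reinterpret_cast<const int*>(mem_ptr);
  out->assign(src, src + count);          // copy count ints out of the buffer
  return mem_ptr + sizeof(int) * count;   // caller continues reading from here
}
// ----------------------------------------------------------------------------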
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data) {
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
size_t total_sample_size, data_size_t num_data) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
bin_mappers[i].reset(new BinMapper());
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i]->FindBin(&sample_values[i], total_sample_size, io_config_.max_bin, bin_type);
}
auto dataset = std::unique_ptr<Dataset>(new Dataset());
dataset->features_.clear();
dataset->num_data_ = num_data;
// -1 means this feature is not used
dataset->used_feature_map_ = std::vector<int>(bin_mappers.size(), -1);
dataset->num_total_features_ = static_cast<int>(bin_mappers.size());
for (size_t i = 0; i < bin_mappers.size(); ++i) {
if (!bin_mappers[i]->is_trival()) {
// map real feature index to used feature index
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
// push new feature
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mappers[i].release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
// if feature is trivial (only one bin), free the space
Log::Warning("Ignoring Column_%d, it only has one value", static_cast<int>(i));
}
}
dataset->features_.shrink_to_fit();
// fill feature_names_ if not header
if (feature_names_.empty()) {
for (int i = 0; i < dataset->num_total_features_; ++i) {
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
std::stringstream str_buf;
str_buf << "Column_" << i;
feature_names_.push_back(str_buf.str());
}
}
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * io_config_.min_data_in_leaf) / num_data * sample_values.size());
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], total_sample_size,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->feature_names_ = feature_names_;
dataset->num_features_ = static_cast<int>(dataset->features_.size());
dataset->metadata_.Init(dataset->num_data_, NO_SPECIFIC, NO_SPECIFIC);
dataset->Construct(bin_mappers, sample_indices, total_sample_size, io_config_);
return dataset.release();
}
......@@ -488,13 +516,34 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_);
}
if (dataset->features_.empty()) {
if (dataset->feature_groups_.empty()) {
Log::Fatal("No usable features in data file %s", dataset->data_filename_);
}
if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
static_cast<int>(dataset->feature_names_.size()));
}
bool is_feature_order_by_group = true;
int last_group = -1;
int last_sub_feature = -1;
// if features are ordered by group, there is no need to use hist_buf
for (int i = 0; i < dataset->num_features_; ++i) {
int group = dataset->feature2group_[i];
int sub_feature = dataset->feature2subfeature_[i];
if (group < last_group) {
is_feature_order_by_group = false;
} else if (group == last_group) {
if (sub_feature <= last_sub_feature) {
is_feature_order_by_group = false;
break;
}
}
last_group = group;
last_sub_feature = sub_feature;
}
if (!is_feature_order_by_group) {
Log::Fatal("feature in dataset should order by group");
}
}
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
......@@ -512,7 +561,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if (query_boundaries == nullptr) {
// if no query data is given, the minimal sampling unit is one record
*num_global_data = text_reader.ReadAndFilterLines([this, rank, num_machines](data_size_t) {
if (random_.NextInt(0, num_machines) == rank) {
if (random_.NextShort(0, num_machines) == rank) {
return true;
} else {
return false;
......@@ -532,7 +581,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if (line_idx >= query_boundaries[qid + 1]) {
// this is a new query
is_query_used = false;
if (random_.NextInt(0, num_machines) == rank) {
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
......@@ -571,7 +620,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
// if no query file is given, the minimal sampling unit is one record
*num_global_data = text_reader.SampleAndFilterFromFile([this, rank, num_machines]
(data_size_t) {
if (random_.NextInt(0, num_machines) == rank) {
if (random_.NextShort(0, num_machines) == rank) {
return true;
} else {
return false;
......@@ -592,7 +641,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
if (line_idx >= query_boundaries[qid + 1]) {
// this is a new query
is_query_used = false;
if (random_.NextInt(0, num_machines) == rank) {
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
......@@ -605,30 +654,28 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
}
void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset) {
// sample_values[i][j], means the value of j-th sample on i-th feature
std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_indices;
// temp buffer for one line of features and the label
std::vector<std::pair<int, double>> oneline_features;
double label;
for (size_t i = 0; i < sample_data.size(); ++i) {
for (int i = 0; i < static_cast<int>(sample_data.size()); ++i) {
oneline_features.clear();
// parse features
parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
for (std::pair<int, double>& inner_data : oneline_features) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
sample_values.emplace_back();
}
sample_values.resize(inner_data.first + 1);
sample_indices.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > 1e-15) {
sample_values[inner_data.first].push_back(inner_data.second);
if (std::fabs(inner_data.second) > kEpsilon) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_indices[inner_data.first].emplace_back(i);
}
}
}
dataset->features_.clear();
dataset->feature_groups_.clear();
if (feature_names_.empty()) {
// -1 means this feature is not used
......@@ -653,48 +700,32 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
}
dataset->feature_names_ = feature_names_;
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * io_config_.min_data_in_leaf) / dataset->num_data_ * sample_values.size());
// start find bins
if (num_machines == 1) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
// if only one machine, find bin locally
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i].reset(nullptr);
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i].reset(new BinMapper());
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i]->FindBin(&sample_values[i], sample_data.size(), io_config_.max_bin, bin_type);
}
for (size_t i = 0; i < sample_values.size(); ++i) {
if (bin_mappers[i] == nullptr) {
Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
} else if (!bin_mappers[i]->is_trival()) {
// map real feature index to used feature index
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
// push new feature
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mappers[i].release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
// if feature is trivial (only one bin), free the space
Log::Warning("Ignoring feature %s, it only has one value", feature_names_[i].c_str());
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
} else {
// if have multi-machines, need find bin distributed
// if have multi-machines, need to find bin distributed
// different machines will find bin for different features
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ strat[i], start[i] + len[i] )
// machine i will find bins for features in [ start[i], start[i] + len[i] )
std::vector<int> start(num_machines);
std::vector<int> len(num_machines);
int total_num_feature = static_cast<int>(sample_values.size());
......@@ -707,8 +738,50 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start[i + 1] = start[i] + len[i];
}
len[num_machines - 1] = total_num_feature - start[num_machines - 1];
// get size of bin mapper with max_bin_ size
int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
// get max_bin
int local_max_bin = 0;
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
local_max_bin = std::max(local_max_bin, bin_mappers[i]->num_bin());
}
int max_bin = local_max_bin;
// sync global max_bin
Network::Allreduce(reinterpret_cast<char*>(&local_max_bin),
sizeof(local_max_bin), sizeof(local_max_bin),
reinterpret_cast<char*>(&max_bin),
[] (const char* src, char* dst, int len) {
int used_size = 0;
const int type_size = sizeof(int);
const int *p1;
int *p2;
while (used_size < len) {
p1 = reinterpret_cast<const int *>(src);
p2 = reinterpret_cast<int *>(dst);
if (*p1 > *p2) {
std::memcpy(dst, src, type_size);
}
src += type_size;
dst += type_size;
used_size += type_size;
}
});
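// Worked example (illustrative): the reducer above is an element-wise max
// over int-typed buffers. If this machine holds local_max_bin = 64 and a
// peer contributes 250, the merged buffer holds 250; after the Allreduce
// every machine agrees on the same global max_bin.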
// get size of bin mapper with max_bin size
int type_size = BinMapper::SizeForSpecificBin(max_bin);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int buffer_size = type_size * total_num_feature;
auto input_buffer = std::vector<char>(buffer_size);
......@@ -717,13 +790,12 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
BinMapper bin_mapper;
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
bin_mapper.FindBin(&sample_values[start[rank] + i], sample_data.size(), io_config_.max_bin, bin_type);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
// free the local bin mapper; its contents now live in the buffer
bin_mappers[i].reset(nullptr);
}
// convert to binary size
for (int i = 0; i < num_machines; ++i) {
......@@ -735,26 +807,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// restore feature bins from the buffer
for (int i = 0; i < total_num_feature; ++i) {
if (ignore_features_.count(i) > 0) {
Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
bin_mappers[i] = nullptr;
continue;
}
auto bin_mapper = std::unique_ptr<BinMapper>(new BinMapper());
bin_mapper->CopyFrom(output_buffer.data() + i * type_size);
if (!bin_mapper->is_trival()) {
dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
dataset->features_.emplace_back(std::unique_ptr<Feature>(
new Feature(static_cast<int>(i),
bin_mapper.release(),
dataset->num_data_,
io_config_.is_enable_sparse)
));
} else {
Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->CopyFrom(output_buffer.data() + i * type_size);
}
}
dataset->features_.shrink_to_fit();
dataset->num_features_ = static_cast<int>(dataset->features_.size());
sample_values.clear();
dataset->Construct(bin_mappers, sample_indices, sample_data.size(), io_config_);
}
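// --- Illustrative sketch (not from the source) ------------------------------
// The start/len partition used for distributed bin finding: features are
// split into (nearly) equal contiguous blocks, one block per machine.
#include <algorithm>
#include <vector>

inline void PartitionFeatures(int total_num_feature, int num_machines,
                              std::vector<int>* start, std::vector<int>* len) {
  start->assign(num_machines, 0);
  len->assign(num_machines, 0);
  const int step = (total_num_feature + num_machines - 1) / num_machines;  // ceil
  for (int i = 0; i < num_machines - 1; ++i) {
    (*len)[i] = std::max(0, std::min(step, total_num_feature - (*start)[i]));
    (*start)[i + 1] = (*start)[i] + (*len)[i];
  }
  (*len)[num_machines - 1] = total_num_feature - (*start)[num_machines - 1];
}
// e.g. 10 features on 4 machines: start = {0, 3, 6, 9}, len = {3, 3, 3, 1}
// ----------------------------------------------------------------------------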
/*! \brief Extract local features from memory */
......@@ -763,7 +824,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
double tmp_label = 0.0f;
if (predict_fun_ == nullptr) {
// if prediction with an initial model is not needed
#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
for (data_size_t i = 0; i < dataset->num_data_; ++i) {
const int tid = omp_get_thread_num();
oneline_features.clear();
......@@ -781,7 +842,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
......@@ -794,7 +857,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
} else {
// if prediction with an initial model is needed
std::vector<double> init_score(dataset->num_data_ * num_class_);
#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
#pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
for (data_size_t i = 0; i < dataset->num_data_; ++i) {
const int tid = omp_get_thread_num();
oneline_features.clear();
......@@ -817,7 +880,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
......@@ -867,7 +932,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
dataset->features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, start_idx + i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
......
......@@ -9,21 +9,41 @@
namespace LightGBM {
template <typename VAL_T>
class DenseBin;
template <typename VAL_T>
class DenseBinIterator : public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
max_bin_(static_cast<VAL_T>(max_bin)),
default_bin_(static_cast<uint8_t>(default_bin)) {
if (default_bin_ == 0) {
bias_ = 1;
} else {
bias_ = 0;
}
}
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { }
private:
const DenseBin<VAL_T>* bin_data_;
VAL_T min_bin_;
VAL_T max_bin_;
VAL_T default_bin_;
uint8_t bias_;
};
/*!
* \brief Used to store bins for dense feature
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin: public Bin {
class DenseBin : public Bin {
public:
DenseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) {
data_.resize(num_data_);
VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
data_[i] = default_bin_T;
}
friend DenseBinIterator<VAL_T>;
DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
}
~DenseBin() {
......@@ -33,24 +53,27 @@ public:
data_[idx] = static_cast<VAL_T>(value);
}
inline uint32_t Get(data_size_t idx) const {
return static_cast<uint32_t>(data_[idx]);
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
data_.resize(num_data_);
}
}
BinIterator* GetIterator(data_size_t start_idx) const override;
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
data_size_t rest = num_data % 4;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
VAL_T bin0 = data_[data_indices[i]];
VAL_T bin1 = data_[data_indices[i + 1]];
VAL_T bin2 = data_[data_indices[i + 2]];
VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
......@@ -68,19 +91,19 @@ public:
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
VAL_T bin = data_[data_indices[i]];
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
} else { // use full data
data_size_t rest = num_data % 4;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
VAL_T bin0 = data_[i];
VAL_T bin1 = data_[i + 1];
VAL_T bin2 = data_[i + 2];
VAL_T bin3 = data_[i + 3];
const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
......@@ -98,7 +121,7 @@ public:
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
VAL_T bin = data_[i];
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
......@@ -106,16 +129,52 @@ public:
}
}
virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
if (default_bin == 0) {
th -= 1;
}
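// Worked example (illustrative): when default_bin == 0 the most frequent bin
// is not stored, so every stored value sits one above its logical bin and th
// is shifted down by one to compensate. With min_bin = 1, threshold = 3:
// th = 3 + 1 - 1 = 3; a stored value of 4 is logical bin 4 and goes right
// (4 > 3), while a stored value of 3 is logical bin 3 and stays left.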
data_size_t lte_count = 0;
data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) {
data_size_t idx = data_indices[i];
if (data_[idx] > threshold) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
......@@ -140,6 +199,13 @@ public:
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = reinterpret_cast<const DenseBin<VAL_T>*>(full_bin);
for (int i = 0; i < num_used_indices; ++i) {
data_[i] = other_bin->data_[used_indices[i]];
}
}
void SaveBinaryToFile(FILE* file) const override {
fwrite(data_.data(), sizeof(VAL_T), num_data_, file);
}
......@@ -154,45 +220,19 @@ protected:
};
template <typename VAL_T>
class DenseBinIterator: public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data)
: bin_data_(bin_data) {
}
uint32_t Get(data_size_t idx) override {
return bin_data_->Get(idx);
uint32_t DenseBinIterator<VAL_T>::Get(data_size_t idx) {
auto ret = bin_data_->data_[idx];
if (ret >= min_bin_ && ret <= max_bin_) {
return ret - min_bin_ + bias_;
} else {
return default_bin_;
}
private:
const DenseBin<VAL_T>* bin_data_;
};
template <typename VAL_T>
BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const {
return new DenseBinIterator<VAL_T>(this);
}
template <typename VAL_T>
class DenseCategoricalBin: public DenseBin<VAL_T> {
public:
DenseCategoricalBin(data_size_t num_data, int default_bin)
: DenseBin<VAL_T>(num_data, default_bin) {
}
virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t lte_count = 0;
data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) {
data_size_t idx = data_indices[i];
if (DenseBin<VAL_T>::data_[idx] != threshold) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
return lte_count;
}
};
BinIterator* DenseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
return new DenseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
}
} // namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_
#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#include <LightGBM/bin.h>
#include <vector>
#include <cstring>
#include <cstdint>
namespace LightGBM {
class Dense4bitsBin;
class Dense4bitsBinIterator: public BinIterator {
public:
explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
max_bin_(static_cast<uint8_t>(max_bin)),
default_bin_(static_cast<uint8_t>(default_bin)) {
if (default_bin_ == 0) {
bias_ = 1;
} else {
bias_ = 0;
}
}
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { }
private:
const Dense4bitsBin* bin_data_;
uint8_t min_bin_;
uint8_t max_bin_;
uint8_t default_bin_;
uint8_t bias_;
};
class Dense4bitsBin: public Bin {
public:
friend Dense4bitsBinIterator;
Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) {
int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
}
~Dense4bitsBin() {
}
void Push(int, data_size_t idx, uint32_t value) override {
if (buf_.empty()) {
#pragma omp critical
{
if (buf_.empty()) {
int len = (num_data_ + 1) / 2;
buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
}
}
}
const int i1 = idx >> 1;
const int i2 = (idx & 1) << 2;
const uint8_t val = static_cast<uint8_t>(value) << i2;
if (i2 == 0) {
data_[i1] = val;
} else {
buf_[i1] = val;
}
}
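// Worked example (illustrative): two 4-bit bins share one byte. For idx = 7:
// i1 = 7 >> 1 = 3 selects the byte and i2 = (7 & 1) << 2 = 4 selects the
// high nibble, so value 0x9 is staged as 0x90. Even indices write the low
// nibble of data_ directly; odd indices go through buf_ and are OR-ed into
// data_ by FinishLoad(), so concurrent pushes never race on the same byte.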
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
int len = (num_data_ + 1) / 2;
data_.resize(len);
}
}
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 1];
const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 2];
const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_hessians += ordered_hessians[i];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin3].sum_hessians += ordered_hessians[i + 3];
++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
} else { // use full data
const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
int j = i >> 1;
const auto bin0 = (data_[j]) & 0xf;
const auto bin1 = (data_[j] >> 4) & 0xf;
++j;
const auto bin2 = (data_[j]) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_hessians += ordered_hessians[i];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin3].sum_hessians += ordered_hessians[i + 3];
++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
}
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin);
uint8_t maxb = static_cast<uint8_t>(max_bin);
if (default_bin == 0) {
th -= 1;
}
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief dense features do not use ordered bins, so return nullptr */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {
if (buf_.empty()) { return; }
int len = (num_data_ + 1) / 2;
for (int i = 0; i < len; ++i) {
data_[i] |= buf_[i];
}
buf_.clear();
}
void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
const uint8_t* mem_data = reinterpret_cast<const uint8_t*>(memory);
if (!local_used_indices.empty()) {
const data_size_t rest = num_data_ & 1;
for (int i = 0; i < num_data_ - rest; i += 2) {
// get old bins
data_size_t idx = local_used_indices[i];
const auto bin1 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
idx = local_used_indices[i + 1];
const auto bin2 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
// add
const int i1 = i >> 1;
data_[i1] = (bin1 | (bin2 << 4));
}
if (rest) {
data_size_t idx = local_used_indices[num_data_ - 1];
data_[num_data_ / 2] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf;  // last (even) index maps to the low nibble of the final byte
}
} else {
for (size_t i = 0; i < data_.size(); ++i) {
data_[i] = mem_data[i];
}
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = reinterpret_cast<const Dense4bitsBin*>(full_bin);
const data_size_t rest = num_used_indices & 1;
for (int i = 0; i < num_used_indices - rest; i += 2) {
data_size_t idx = used_indices[i];
const auto bin1 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
idx = used_indices[i + 1];
const auto bin2 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
const int i1 = i >> 1;
data_[i1] = (bin1 | (bin2 << 4));
}
if (rest) {
data_size_t idx = used_indices[num_used_indices - 1];
data_[num_used_indices / 2] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
}
}
void SaveBinaryToFile(FILE* file) const override {
fwrite(data_.data(), sizeof(uint8_t), data_.size(), file);
}
size_t SizesInByte() const override {
return sizeof(uint8_t) * data_.size();
}
protected:
data_size_t num_data_;
std::vector<uint8_t> data_;
std::vector<uint8_t> buf_;
};
uint32_t Dense4bitsBinIterator::Get(data_size_t idx) {
const auto bin = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin >= min_bin_ && bin <= max_bin_) {
return bin - min_bin_ + bias_;
} else {
return default_bin_;
}
}
BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
......@@ -12,6 +12,9 @@ Metadata::Metadata() {
num_init_score_ = 0;
num_data_ = 0;
num_queries_ = 0;
weight_load_from_file_ = false;
query_load_from_file_ = false;
init_score_load_from_file_ = false;
}
void Metadata::Init(const char * data_filename) {
......@@ -40,6 +43,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = 0.0f;
}
weight_load_from_file_ = false;
}
if (query_idx >= 0) {
if (!query_boundaries_.empty()) {
......@@ -52,6 +56,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
for (data_size_t i = 0; i < num_data_; ++i) {
queries_[i] = 0;
}
query_load_from_file_ = false;
}
}
......@@ -185,87 +190,92 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
Log::Fatal("Initial score size doesn't match data size");
}
} else {
if (!queries_.empty()) {
Log::Fatal("Cannot used query_id for parallel training");
}
data_size_t num_used_data = static_cast<data_size_t>(used_data_indices.size());
// check weights
if (weights_.size() > 0 && num_weights_ != num_all_data) {
weights_.clear();
num_weights_ = 0;
Log::Fatal("Weights size doesn't match data size");
}
// check query boundaries
if (!query_boundaries_.empty() && query_boundaries_[num_queries_] != num_all_data) {
query_boundaries_.clear();
num_queries_ = 0;
Log::Fatal("Query size doesn't match data size");
}
// an initial score file was provided
if (!init_score_.empty() && (num_init_score_ % num_all_data) != 0) {
init_score_.clear();
num_init_score_ = 0;
Log::Fatal("Initial score size doesn't match data size");
}
// get local weights
if (!weights_.empty()) {
auto old_weights = weights_;
num_weights_ = num_data_;
weights_ = std::vector<float>(num_data_);
if (weight_load_from_file_) {
if (weights_.size() > 0 && num_weights_ != num_all_data) {
weights_.clear();
num_weights_ = 0;
Log::Fatal("Weights size doesn't match data size");
}
// get local weights
if (!weights_.empty()) {
auto old_weights = weights_;
num_weights_ = num_data_;
weights_ = std::vector<float>(num_data_);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
weights_[i] = old_weights[used_data_indices[i]];
for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
weights_[i] = old_weights[used_data_indices[i]];
}
old_weights.clear();
}
old_weights.clear();
}
// get local query boundaries
if (!query_boundaries_.empty()) {
std::vector<data_size_t> used_query;
data_size_t data_idx = 0;
for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_data; ++qid) {
data_size_t start = query_boundaries_[qid];
data_size_t end = query_boundaries_[qid + 1];
data_size_t len = end - start;
if (used_data_indices[data_idx] > start) {
continue;
} else if (used_data_indices[data_idx] == start) {
if (num_used_data >= data_idx + len && used_data_indices[data_idx + len - 1] == end - 1) {
used_query.push_back(qid);
data_idx += len;
if (query_load_from_file_) {
// check query boundaries
if (!query_boundaries_.empty() && query_boundaries_[num_queries_] != num_all_data) {
query_boundaries_.clear();
num_queries_ = 0;
Log::Fatal("Query size doesn't match data size");
}
// get local query boundaries
if (!query_boundaries_.empty()) {
std::vector<data_size_t> used_query;
data_size_t data_idx = 0;
for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_data; ++qid) {
data_size_t start = query_boundaries_[qid];
data_size_t end = query_boundaries_[qid + 1];
data_size_t len = end - start;
if (used_data_indices[data_idx] > start) {
continue;
} else if (used_data_indices[data_idx] == start) {
if (num_used_data >= data_idx + len && used_data_indices[data_idx + len - 1] == end - 1) {
used_query.push_back(qid);
data_idx += len;
} else {
Log::Fatal("Data partition error, data didn't match queries");
}
} else {
Log::Fatal("Data partition error, data didn't match queries");
}
} else {
Log::Fatal("Data partition error, data didn't match queries");
}
auto old_query_boundaries = query_boundaries_;
query_boundaries_ = std::vector<data_size_t>(used_query.size() + 1);
num_queries_ = static_cast<data_size_t>(used_query.size());
query_boundaries_[0] = 0;
for (data_size_t i = 0; i < num_queries_; ++i) {
data_size_t qid = used_query[i];
data_size_t len = old_query_boundaries[qid + 1] - old_query_boundaries[qid];
query_boundaries_[i + 1] = query_boundaries_[i] + len;
}
old_query_boundaries.clear();
}
auto old_query_boundaries = query_boundaries_;
query_boundaries_ = std::vector<data_size_t>(used_query.size() + 1);
num_queries_ = static_cast<data_size_t>(used_query.size());
query_boundaries_[0] = 0;
for (data_size_t i = 0; i < num_queries_; ++i) {
data_size_t qid = used_query[i];
data_size_t len = old_query_boundaries[qid + 1] - old_query_boundaries[qid];
query_boundaries_[i + 1] = query_boundaries_[i] + len;
}
old_query_boundaries.clear();
}
if (init_score_load_from_file_) {
// an initial score file was provided
if (!init_score_.empty() && (num_init_score_ % num_all_data) != 0) {
init_score_.clear();
num_init_score_ = 0;
Log::Fatal("Initial score size doesn't match data size");
}
// get local initial scores
if (!init_score_.empty()) {
auto old_scores = init_score_;
int num_class = static_cast<int>(num_init_score_ / num_all_data);
num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
init_score_ = std::vector<double>(num_init_score_);
// get local initial scores
if (!init_score_.empty()) {
auto old_scores = init_score_;
int num_class = static_cast<int>(num_init_score_ / num_all_data);
num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
init_score_ = std::vector<double>(num_init_score_);
#pragma omp parallel for schedule(static)
for (int k = 0; k < num_class; ++k){
for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
for (int k = 0; k < num_class; ++k) {
for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
}
}
old_scores.clear();
}
old_scores.clear();
}
// re-load query weights
LoadQueryWeights();
}
......@@ -289,6 +299,7 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
for (int64_t i = 0; i < num_init_score_; ++i) {
init_score_[i] = init_score[i];
}
init_score_load_from_file_ = false;
}
void Metadata::SetLabel(const float* label, data_size_t len) {
......@@ -326,6 +337,7 @@ void Metadata::SetWeights(const float* weights, data_size_t len) {
weights_[i] = weights[i];
}
LoadQueryWeights();
weight_load_from_file_ = false;
}
void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
......@@ -352,48 +364,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
query_boundaries_[i + 1] = query_boundaries_[i] + query[i];
}
LoadQueryWeights();
}
void Metadata::SetQueryId(const data_size_t* query_id, data_size_t len) {
std::lock_guard<std::mutex> lock(mutex_);
// a null pointer clears the existing query information
if (query_id == nullptr || len == 0) {
query_boundaries_.clear();
queries_.clear();
num_queries_ = 0;
return;
}
if (num_data_ != len) {
Log::Fatal("len of query id is not same with #data");
}
if (!queries_.empty()) { queries_.clear(); }
queries_ = std::vector<data_size_t>(num_data_);
for (data_size_t i = 0; i < num_data_; ++i) {
queries_[i] = query_id[i];
}
// convert query ids to query boundaries
std::vector<data_size_t> tmp_buffer;
data_size_t last_qid = -1;
data_size_t cur_cnt = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (last_qid != queries_[i]) {
if (cur_cnt > 0) {
tmp_buffer.push_back(cur_cnt);
}
cur_cnt = 0;
last_qid = queries_[i];
}
++cur_cnt;
}
tmp_buffer.push_back(cur_cnt);
query_boundaries_ = std::vector<data_size_t>(tmp_buffer.size() + 1);
num_queries_ = static_cast<data_size_t>(tmp_buffer.size());
query_boundaries_[0] = 0;
for (size_t i = 0; i < tmp_buffer.size(); ++i) {
query_boundaries_[i + 1] = query_boundaries_[i] + tmp_buffer[i];
}
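// e.g. query_id = [0, 0, 0, 5, 5, 7] gives counts [3, 2, 1] and
// query_boundaries_ = [0, 3, 5, 6]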
queries_.clear();
LoadQueryWeights();
query_load_from_file_ = false;
}
void Metadata::LoadWeights() {
@@ -415,6 +386,7 @@ void Metadata::LoadWeights() {
Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
weights_[i] = static_cast<float>(tmp_weight);
}
weight_load_from_file_ = true;
}
void Metadata::LoadInitialScore() {
@@ -457,6 +429,7 @@ void Metadata::LoadInitialScore() {
}
}
}
init_score_load_from_file_ = true;
}
void Metadata::LoadQueryBoundaries() {
@@ -478,6 +451,7 @@ void Metadata::LoadQueryBoundaries() {
Common::Atoi(reader.Lines()[i].c_str(), &tmp_cnt);
query_boundaries_[i + 1] = query_boundaries_[i] + static_cast<data_size_t>(tmp_cnt);
}
query_load_from_file_ = true;
}
void Metadata::LoadQueryWeights() {
@@ -516,12 +490,14 @@ void Metadata::LoadFromMemory(const void* memory) {
weights_ = std::vector<float>(num_weights_);
std::memcpy(weights_.data(), mem_ptr, sizeof(float)*num_weights_);
mem_ptr += sizeof(float)*num_weights_;
weight_load_from_file_ = true;
}
if (num_queries_ > 0) {
if (!query_boundaries_.empty()) { query_boundaries_.clear(); }
query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
mem_ptr += sizeof(data_size_t)*(num_queries_ + 1);
query_load_from_file_ = true;
}
LoadQueryWeights();
}
@@ -29,17 +29,19 @@ public:
struct SparsePair {
data_size_t ridx; // data(row) index
VAL_T bin; // bin for this data
SparsePair(data_size_t r, VAL_T b) : ridx(r), bin(b) {}
SparsePair() : ridx(0), bin(0) {}
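// the default constructor lets ordered_pair_ be sized with resize()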
};
OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
int non_zero_cnt = 0;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
++non_zero_cnt;
}
ordered_pair_.resize(non_zero_cnt);
leaf_cnt_.push_back(non_zero_cnt);
}
~OrderedSparseBin() {
@@ -81,17 +83,55 @@ public:
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
const int rest = (end - start) % 4;
data_size_t i = start;
// use data on current leaf to construct histogram
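// process four pairs per iteration to reduce loop overhead and let the
// per-bin accumulations overlap; the scalar loop below handles the remainder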
for (; i < end - rest; i += 4) {
const VAL_T bin0 = ordered_pair_[i].bin;
const VAL_T bin1 = ordered_pair_[i + 1].bin;
const VAL_T bin2 = ordered_pair_[i + 2].bin;
const VAL_T bin3 = ordered_pair_[i + 3].bin;
const auto g0 = gradient[ordered_pair_[i].ridx];
const auto h0 = hessian[ordered_pair_[i].ridx];
const auto g1 = gradient[ordered_pair_[i + 1].ridx];
const auto h1 = hessian[ordered_pair_[i + 1].ridx];
const auto g2 = gradient[ordered_pair_[i + 2].ridx];
const auto h2 = hessian[ordered_pair_[i + 2].ridx];
const auto g3 = gradient[ordered_pair_[i + 3].ridx];
const auto h3 = hessian[ordered_pair_[i + 3].ridx];
out[bin0].sum_gradients += g0;
out[bin1].sum_gradients += g1;
out[bin2].sum_gradients += g2;
out[bin3].sum_gradients += g3;
out[bin0].sum_hessians += h0;
out[bin1].sum_hessians += h1;
out[bin2].sum_hessians += h2;
out[bin3].sum_hessians += h3;
++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < end; ++i) {
const VAL_T bin0 = ordered_pair_[i].bin;
const auto g0 = gradient[ordered_pair_[i].ridx];
const auto h0 = hessian[ordered_pair_[i].ridx];
out[bin0].sum_gradients += g0;
out[bin0].sum_hessians += h0;
++out[bin0].cnt;
}
}
void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
// get current leaf boundary
const data_size_t l_start = leaf_start_[leaf];
const data_size_t l_end = l_start + leaf_cnt_[leaf];
@@ -99,7 +139,7 @@ public:
data_size_t new_left_end = l_start;
for (data_size_t i = l_start; i < l_end; ++i) {
if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
@@ -109,7 +149,9 @@ public:
leaf_cnt_[leaf] = new_left_end - l_start;
leaf_cnt_[right_leaf] = l_end - new_left_end;
}
data_size_t NonZeroCount(int leaf) const override {
return static_cast<data_size_t>(leaf_cnt_[leaf]);
}
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
/*! \brief Disable copy */
......
@@ -5,40 +5,59 @@
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstring>
#include <cstdint>
#include <limits>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class SparseBin;
const size_t kNumFastIndex = 64;
const uint8_t kMaxDelta = 255;
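// kMaxDelta is the largest row gap that fits in a single uint8_t delta byte;
// larger gaps are split into continuation bytes (see LoadFromPair)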
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data,
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
max_bin_(static_cast<VAL_T>(max_bin)),
default_bin_(static_cast<VAL_T>(default_bin)) {
if (default_bin_ == 0) {
bias_ = 1;
} else {
bias_ = 0;
}
Reset(0);
}
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
: bin_data_(bin_data) {
Reset(start_idx);
}
inline VAL_T RawGet(data_size_t idx);
inline uint32_t Get(data_size_t idx) override {
VAL_T ret = RawGet(idx);
if (ret >= min_bin_ && ret <= max_bin_) {
return ret - min_bin_ + bias_;
} else {
return default_bin_;
}
}
inline void Reset(data_size_t idx) override;
private:
const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_;
data_size_t i_delta_;
VAL_T min_bin_;
VAL_T max_bin_;
VAL_T default_bin_;
uint8_t bias_;
};
template <typename VAL_T>
@@ -50,32 +69,33 @@ public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
SparseBin(data_size_t num_data)
: num_data_(num_data) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
push_buffers_.resize(num_threads);
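// one buffer per OpenMP thread lets Push() append without locking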
}
~SparseBin() {
}
void ReSize(data_size_t num_data) override {
num_data_ = num_data;
}
void Push(int tid, data_size_t idx, uint32_t value) override {
// zero-valued bins are not stored
auto cur_bin = static_cast<VAL_T>(value);
if (cur_bin != 0) {
push_buffers_[tid].emplace_back(idx, cur_bin);
}
}
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
const score_t*, HistogramBinEntry*) const override {
@@ -86,34 +106,70 @@ public:
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
++(*i_delta);
data_size_t shift = 0;
data_size_t delta = deltas_[*i_delta];
while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
++(*i_delta);
shift += 8;
delta |= static_cast<data_size_t>(deltas_[*i_delta]) << shift;
}
*cur_pos += delta;
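// e.g. deltas {0xBC, 0x02} with vals {0, bin} decode to a row gap of
// 0xBC | (0x02 << 8) == 700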
if (*i_delta < num_vals_) {
return true;
} else {
*cur_pos = num_data_;
return false;
}
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
// no need to split
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
if (default_bin == 0) {
th -= 1;
}
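// when default_bin is 0, stored bin values carry a +1 bias (the zero bin
// is implicit), so the raw threshold shifts down by one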
SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
if (default_bin == threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin != th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
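// rows whose raw bin falls outside [minb, maxb] hold the default bin and
// were routed above to whichever side the default bin belongs on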
return lte_count;
@@ -125,44 +181,40 @@ public:
void FinishLoad() override {
// count the total number of pushed (index, bin) pairs
size_t pair_cnt = 0;
for (size_t i = 0; i < push_buffers_.size(); ++i) {
pair_cnt += push_buffers_[i].size();
}
// merge all per-thread buffers into push_buffers_[0] to avoid an extra copy
std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs = push_buffers_[0];
idx_val_pairs.reserve(pair_cnt);
for (size_t i = 1; i < push_buffers_.size(); ++i) {
idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
push_buffers_[i].clear();
push_buffers_[i].shrink_to_fit();
}
// sort by data index
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first;
});
// load delta array
LoadFromPair(idx_val_pairs);
}
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
deltas_.clear();
vals_.clear();
// transform to delta array
data_size_t last_idx = 0;
for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
const data_size_t cur_idx = idx_val_pairs[i].first;
const VAL_T bin = idx_val_pairs[i].second;
data_size_t cur_delta = cur_idx - last_idx;
while (cur_delta >= 256) {
deltas_.push_back(cur_delta & 0xff);
vals_.push_back(0);
cur_delta >>= 8;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
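// e.g. a row gap of 700 (0x2BC) is emitted as bytes 0xBC, 0x02 with vals
// 0 and bin; a zero val can mark a continuation byte because zero bins
// are never pushed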
@@ -259,28 +311,57 @@ public:
}
LoadFromPair(tmp_pair);
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = reinterpret_cast<const SparseBin<VAL_T>*>(full_bin);
SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]);
deltas_.clear();
vals_.clear();
// transform to delta array
data_size_t last_idx = 0;
for (data_size_t i = 0; i < num_used_indices; ++i) {
VAL_T bin = iterator.RawGet(used_indices[i]);
if (bin > 0) {
data_size_t cur_delta = i - last_idx;
while (cur_delta >= 256) {
deltas_.push_back(cur_delta & 0xff);
vals_.push_back(0);
cur_delta >>= 8;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
last_idx = i;
}
}
// avoid out of range
deltas_.push_back(0);
num_vals_ = static_cast<data_size_t>(vals_.size());
// reduce memory cost
deltas_.shrink_to_fit();
vals_.shrink_to_fit();
// generate fast index
GetFastIndex();
}
protected:
data_size_t num_data_;
std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_;
data_size_t num_vals_;
std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
data_size_t fast_index_shift_;
};
template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
while (cur_pos_ < idx) {
bin_data_->NextNonzero(&i_delta_, &cur_pos_);
}
if (cur_pos_ == idx) {
return bin_data_->vals_[i_delta_];
} else {
return 0;
@@ -295,38 +376,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
}
template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
return new SparseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
}
} // namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_