Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
eade219e
Commit
eade219e
authored
Mar 18, 2017
by
Qiwei Ye
Browse files
merge conflict
parents
f23e6083
060bd316
Changes
129
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2657 additions
and
898 deletions
+2657
-898
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+356
-0
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+26
-11
src/application/application.cpp
src/application/application.cpp
+4
-4
src/application/predictor.hpp
src/application/predictor.hpp
+4
-5
src/boosting/boosting.cpp
src/boosting/boosting.cpp
+12
-2
src/boosting/dart.hpp
src/boosting/dart.hpp
+8
-3
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+232
-90
src/boosting/gbdt.h
src/boosting/gbdt.h
+21
-6
src/boosting/goss.hpp
src/boosting/goss.hpp
+204
-0
src/boosting/score_updater.hpp
src/boosting/score_updater.hpp
+2
-0
src/c_api.cpp
src/c_api.cpp
+460
-213
src/io/bin.cpp
src/io/bin.cpp
+133
-94
src/io/config.cpp
src/io/config.cpp
+24
-13
src/io/dataset.cpp
src/io/dataset.cpp
+284
-42
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+202
-135
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+107
-67
src/io/dense_nbits_bin.hpp
src/io/dense_nbits_bin.hpp
+295
-0
src/io/metadata.cpp
src/io/metadata.cpp
+83
-107
src/io/ordered_sparse_bin.hpp
src/io/ordered_sparse_bin.hpp
+54
-12
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+146
-94
No files found.
python-package/lightgbm/plotting.py
0 → 100644
View file @
eade219e
# coding: utf-8
# pylint: disable = C0103
"""Plotting Library."""
from
__future__
import
absolute_import
import
warnings
from
copy
import
deepcopy
from
io
import
BytesIO
import
numpy
as
np
from
.basic
import
Booster
from
.sklearn
import
LGBMModel
def
check_not_tuple_of_2_elements
(
obj
,
obj_name
=
'obj'
):
"""check object is not tuple or does not have 2 elements"""
if
not
isinstance
(
obj
,
tuple
)
or
len
(
obj
)
!=
2
:
raise
TypeError
(
'%s must be a tuple of 2 elements.'
%
obj_name
)
def
plot_importance
(
booster
,
ax
=
None
,
height
=
0.2
,
xlim
=
None
,
ylim
=
None
,
title
=
'Feature importance'
,
xlabel
=
'Feature importance'
,
ylabel
=
'Features'
,
importance_type
=
'split'
,
max_num_features
=
None
,
ignore_zero
=
True
,
figsize
=
None
,
grid
=
True
,
**
kwargs
):
"""Plot model feature importances.
Parameters
----------
booster : Booster or LGBMModel
Booster or LGBMModel instance
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
height : float
Bar height, passed to ax.barh()
xlim : tuple of 2 elements
Tuple passed to axes.xlim()
ylim : tuple of 2 elements
Tuple passed to axes.ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable.
importance_type : str
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
max_num_features : int
Max number of top features displayed on plot.
If None or smaller than 1, all features will be displayed.
ignore_zero : bool
Ignore features with zero importance
figsize : tuple of 2 elements
Figure size
grid : bool
Whether add grid for axes
**kwargs :
Other keywords passed to ax.barh()
Returns
-------
ax : matplotlib Axes
"""
try
:
import
matplotlib.pyplot
as
plt
except
ImportError
:
raise
ImportError
(
'You must install matplotlib to plot importance.'
)
if
isinstance
(
booster
,
LGBMModel
):
booster
=
booster
.
booster_
elif
not
isinstance
(
booster
,
Booster
):
raise
TypeError
(
'booster must be Booster or LGBMModel.'
)
importance
=
booster
.
feature_importance
(
importance_type
=
importance_type
)
feature_name
=
booster
.
feature_name
()
if
not
len
(
importance
):
raise
ValueError
(
'Booster feature_importances are empty.'
)
tuples
=
sorted
(
zip
(
feature_name
,
importance
),
key
=
lambda
x
:
x
[
1
])
if
ignore_zero
:
tuples
=
[
x
for
x
in
tuples
if
x
[
1
]
>
0
]
if
max_num_features
is
not
None
and
max_num_features
>
0
:
tuples
=
tuples
[
-
max_num_features
:]
labels
,
values
=
zip
(
*
tuples
)
if
ax
is
None
:
if
figsize
is
not
None
:
check_not_tuple_of_2_elements
(
figsize
,
'figsize'
)
_
,
ax
=
plt
.
subplots
(
1
,
1
,
figsize
=
figsize
)
ylocs
=
np
.
arange
(
len
(
values
))
ax
.
barh
(
ylocs
,
values
,
align
=
'center'
,
height
=
height
,
**
kwargs
)
for
x
,
y
in
zip
(
values
,
ylocs
):
ax
.
text
(
x
+
1
,
y
,
x
,
va
=
'center'
)
ax
.
set_yticks
(
ylocs
)
ax
.
set_yticklabels
(
labels
)
if
xlim
is
not
None
:
check_not_tuple_of_2_elements
(
xlim
,
'xlim'
)
else
:
xlim
=
(
0
,
max
(
values
)
*
1.1
)
ax
.
set_xlim
(
xlim
)
if
ylim
is
not
None
:
check_not_tuple_of_2_elements
(
ylim
,
'ylim'
)
else
:
ylim
=
(
-
1
,
len
(
values
))
ax
.
set_ylim
(
ylim
)
if
title
is
not
None
:
ax
.
set_title
(
title
)
if
xlabel
is
not
None
:
ax
.
set_xlabel
(
xlabel
)
if
ylabel
is
not
None
:
ax
.
set_ylabel
(
ylabel
)
ax
.
grid
(
grid
)
return
ax
def
plot_metric
(
booster
,
metric
=
None
,
dataset_names
=
None
,
ax
=
None
,
xlim
=
None
,
ylim
=
None
,
title
=
'Metric during training'
,
xlabel
=
'Iterations'
,
ylabel
=
'auto'
,
figsize
=
None
,
grid
=
True
):
"""Plot one metric during training.
Parameters
----------
booster : dict or LGBMModel
Evals_result recorded by lightgbm.train() or LGBMModel instance
metric : str or None
The metric name to plot.
Only one metric supported because different metrics have various scales.
Pass None to pick `first` one (according to dict hashcode).
dataset_names : None or list of str
List of the dataset names to plot.
Pass None to plot all datasets.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
xlim : tuple of 2 elements
Tuple passed to axes.xlim()
ylim : tuple of 2 elements
Tuple passed to axes.ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable. Pass 'auto' to use `metric`.
figsize : tuple of 2 elements
Figure size
grid : bool
Whether add grid for axes
Returns
-------
ax : matplotlib Axes
"""
try
:
import
matplotlib.pyplot
as
plt
except
ImportError
:
raise
ImportError
(
'You must install matplotlib to plot metric.'
)
if
isinstance
(
booster
,
LGBMModel
):
eval_results
=
deepcopy
(
booster
.
evals_result_
)
elif
isinstance
(
booster
,
dict
):
eval_results
=
deepcopy
(
booster
)
else
:
raise
TypeError
(
'booster must be dict or LGBMModel.'
)
num_data
=
len
(
eval_results
)
if
not
num_data
:
raise
ValueError
(
'eval results cannot be empty.'
)
if
ax
is
None
:
if
figsize
is
not
None
:
check_not_tuple_of_2_elements
(
figsize
,
'figsize'
)
_
,
ax
=
plt
.
subplots
(
1
,
1
,
figsize
=
figsize
)
if
dataset_names
is
None
:
dataset_names
=
iter
(
eval_results
.
keys
())
elif
not
isinstance
(
dataset_names
,
(
list
,
tuple
,
set
))
or
not
dataset_names
:
raise
ValueError
(
'dataset_names should be iterable and cannot be empty'
)
else
:
dataset_names
=
iter
(
dataset_names
)
name
=
next
(
dataset_names
)
# take one as sample
metrics_for_one
=
eval_results
[
name
]
num_metric
=
len
(
metrics_for_one
)
if
metric
is
None
:
if
num_metric
>
1
:
msg
=
"""more than one metric available, picking one to plot."""
warnings
.
warn
(
msg
,
stacklevel
=
2
)
metric
,
results
=
metrics_for_one
.
popitem
()
else
:
if
metric
not
in
metrics_for_one
:
raise
KeyError
(
'No given metric in eval results.'
)
results
=
metrics_for_one
[
metric
]
num_iteration
,
max_result
,
min_result
=
len
(
results
),
max
(
results
),
min
(
results
)
x_
=
range
(
num_iteration
)
ax
.
plot
(
x_
,
results
,
label
=
name
)
for
name
in
dataset_names
:
metrics_for_one
=
eval_results
[
name
]
results
=
metrics_for_one
[
metric
]
max_result
,
min_result
=
max
(
max
(
results
),
max_result
),
min
(
min
(
results
),
min_result
)
ax
.
plot
(
x_
,
results
,
label
=
name
)
ax
.
legend
(
loc
=
'best'
)
if
xlim
is
not
None
:
check_not_tuple_of_2_elements
(
xlim
,
'xlim'
)
else
:
xlim
=
(
0
,
num_iteration
)
ax
.
set_xlim
(
xlim
)
if
ylim
is
not
None
:
check_not_tuple_of_2_elements
(
ylim
,
'ylim'
)
else
:
range_result
=
max_result
-
min_result
ylim
=
(
min_result
-
range_result
*
0.2
,
max_result
+
range_result
*
0.2
)
ax
.
set_ylim
(
ylim
)
if
ylabel
==
'auto'
:
ylabel
=
metric
if
title
is
not
None
:
ax
.
set_title
(
title
)
if
xlabel
is
not
None
:
ax
.
set_xlabel
(
xlabel
)
if
ylabel
is
not
None
:
ax
.
set_ylabel
(
ylabel
)
ax
.
grid
(
grid
)
return
ax
def
_to_graphviz
(
graph
,
tree_info
,
show_info
,
feature_names
):
"""Convert specified tree to graphviz instance."""
def
add
(
root
,
parent
=
None
,
decision
=
None
):
"""recursively add node or edge"""
if
'split_index'
in
root
:
# non-leaf
name
=
'split'
+
str
(
root
[
'split_index'
])
if
feature_names
is
not
None
:
label
=
'split_feature_name:'
+
str
(
feature_names
[
root
[
'split_feature'
]])
else
:
label
=
'split_feature_index:'
+
str
(
root
[
'split_feature'
])
label
+=
'
\n
threshold:'
+
str
(
root
[
'threshold'
])
for
info
in
show_info
:
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
graph
.
node
(
name
,
label
=
label
)
if
root
[
'decision_type'
]
==
'no_greater'
:
l_dec
,
r_dec
=
'<='
,
'>'
elif
root
[
'decision_type'
]
==
'is'
:
l_dec
,
r_dec
=
'is'
,
"isn't"
else
:
raise
ValueError
(
'Invalid decision type in tree model.'
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
else
:
# leaf
name
=
'left'
+
str
(
root
[
'leaf_index'
])
label
=
'leaf_value:'
+
str
(
root
[
'leaf_value'
])
if
'leaf_count'
in
show_info
:
label
+=
'
\n
leaf_count:'
+
str
(
root
[
'leaf_count'
])
graph
.
node
(
name
,
label
=
label
)
if
parent
is
not
None
:
graph
.
edge
(
parent
,
name
,
decision
)
add
(
tree_info
[
'tree_structure'
])
return
graph
def
plot_tree
(
booster
,
ax
=
None
,
tree_index
=
0
,
figsize
=
None
,
graph_attr
=
None
,
node_attr
=
None
,
edge_attr
=
None
,
show_info
=
None
):
"""Plot specified tree.
Parameters
----------
booster : Booster, LGBMModel
Booster or LGBMModel instance.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
tree_index : int, default 0
Specify tree index of target tree.
figsize : tuple of 2 elements
Figure size.
graph_attr : dict
Mapping of (attribute, value) pairs for the graph.
node_attr : dict
Mapping of (attribute, value) pairs set for all nodes.
edge_attr : dict
Mapping of (attribute, value) pairs set for all edges.
show_info : list
Information shows on nodes.
options: 'split_gain', 'internal_value', 'internal_count' or 'leaf_count'.
Returns
-------
ax : matplotlib Axes
"""
try
:
import
matplotlib.pyplot
as
plt
import
matplotlib.image
as
image
except
ImportError
:
raise
ImportError
(
'You must install matplotlib to plot tree.'
)
try
:
from
graphviz
import
Digraph
except
ImportError
:
raise
ImportError
(
'You must install graphviz to plot tree.'
)
if
ax
is
None
:
if
figsize
is
not
None
:
check_not_tuple_of_2_elements
(
figsize
,
'figsize'
)
_
,
ax
=
plt
.
subplots
(
1
,
1
,
figsize
=
figsize
)
if
isinstance
(
booster
,
LGBMModel
):
booster
=
booster
.
booster_
elif
not
isinstance
(
booster
,
Booster
):
raise
TypeError
(
'booster must be Booster or LGBMModel.'
)
model
=
booster
.
dump_model
()
tree_infos
=
model
[
'tree_info'
]
if
'feature_names'
in
model
:
feature_names
=
model
[
'feature_names'
]
else
:
feature_names
=
None
if
tree_index
<
len
(
tree_infos
):
tree_info
=
tree_infos
[
tree_index
]
else
:
raise
IndexError
(
'tree_index is out of range.'
)
graph
=
Digraph
(
graph_attr
=
graph_attr
,
node_attr
=
node_attr
,
edge_attr
=
edge_attr
)
if
show_info
is
None
:
show_info
=
[]
ret
=
_to_graphviz
(
graph
,
tree_info
,
show_info
,
feature_names
)
s
=
BytesIO
()
s
.
write
(
ret
.
pipe
(
format
=
'png'
))
s
.
seek
(
0
)
img
=
image
.
imread
(
s
)
ax
.
imshow
(
img
)
ax
.
axis
(
'off'
)
return
ax
python-package/lightgbm/sklearn.py
View file @
eade219e
...
...
@@ -130,6 +130,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha
=
0
,
reg_lambda
=
0
,
scale_pos_weight
=
1
,
is_unbalance
=
False
,
seed
=
0
,
nthread
=-
1
,
silent
=
True
,
sigmoid
=
1.0
,
huber_delta
=
1.0
,
gaussian_eta
=
1.0
,
fair_c
=
1.0
,
poisson_max_delta_step
=
0.7
,
max_position
=
20
,
label_gain
=
None
,
drop_rate
=
0.1
,
skip_drop
=
0.5
,
max_drop
=
50
,
uniform_drop
=
False
,
xgboost_dart_mode
=
False
):
...
...
@@ -192,6 +193,8 @@ class LGBMModel(LGBMModelBase):
It is used to control the width of Gaussian function to approximate hessian.
fair_c : float
Only used in regression. Parameter for Fair loss function.
poisson_max_delta_step : float
parameter used to safeguard optimization in Poisson regression.
max_position : int
Only used in lambdarank, will optimize NDCG at this position.
label_gain : list of float
...
...
@@ -259,6 +262,7 @@ class LGBMModel(LGBMModelBase):
self
.
huber_delta
=
huber_delta
self
.
gaussian_eta
=
gaussian_eta
self
.
fair_c
=
fair_c
self
.
poisson_max_delta_step
=
poisson_max_delta_step
self
.
max_position
=
max_position
self
.
label_gain
=
label_gain
self
.
drop_rate
=
drop_rate
...
...
@@ -280,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
"""
Fit the gradient boosting model
...
...
@@ -311,12 +315,14 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str
feature_name : list of str
, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
...
...
@@ -461,7 +467,7 @@ class LGBMModel(LGBMModelBase):
return
self
.
evals_result
@
property
def
feature_importance_
(
self
):
def
feature_importance
s
_
(
self
):
"""Get normailized feature importances."""
importace_array
=
self
.
booster_
.
feature_importance
().
astype
(
np
.
float32
)
return
importace_array
/
importace_array
.
sum
()
...
...
@@ -470,9 +476,9 @@ class LGBMModel(LGBMModelBase):
def
booster
(
self
):
return
self
.
booster_
@
LGBMDeprecated
(
'Use attribute feature_importance_ instead.'
)
@
LGBMDeprecated
(
'Use attribute feature_importance
s
_ instead.'
)
def
feature_importance
(
self
):
return
self
.
feature_importance_
return
self
.
feature_importance
s
_
class
LGBMRegressor
(
LGBMModel
,
LGBMRegressorBase
):
...
...
@@ -485,6 +491,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
reg_alpha
=
0
,
reg_lambda
=
0
,
seed
=
0
,
nthread
=-
1
,
silent
=
True
,
huber_delta
=
1.0
,
gaussian_eta
=
1.0
,
fair_c
=
1.0
,
poisson_max_delta_step
=
0.7
,
drop_rate
=
0.1
,
skip_drop
=
0.5
,
max_drop
=
50
,
uniform_drop
=
False
,
xgboost_dart_mode
=
False
):
super
(
LGBMRegressor
,
self
).
__init__
(
boosting_type
=
boosting_type
,
num_leaves
=
num_leaves
,
...
...
@@ -497,6 +504,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
reg_alpha
=
reg_alpha
,
reg_lambda
=
reg_lambda
,
seed
=
seed
,
nthread
=
nthread
,
silent
=
silent
,
huber_delta
=
huber_delta
,
gaussian_eta
=
gaussian_eta
,
fair_c
=
fair_c
,
poisson_max_delta_step
=
poisson_max_delta_step
,
drop_rate
=
drop_rate
,
skip_drop
=
skip_drop
,
max_drop
=
max_drop
,
uniform_drop
=
uniform_drop
,
xgboost_dart_mode
=
xgboost_dart_mode
)
...
...
@@ -506,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score
=
None
,
eval_metric
=
"l2"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
callbacks
=
None
):
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
init_score
=
init_score
,
eval_set
=
eval_set
,
...
...
@@ -550,9 +558,9 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
sample_weight
=
None
,
init_score
=
None
,
eval_set
=
None
,
eval_sample_weight
=
None
,
eval_init_score
=
None
,
eval_metric
=
"
binary_
logloss"
,
eval_metric
=
"logloss"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
y
=
self
.
_le
.
transform
(
y
)
...
...
@@ -562,8 +570,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if
self
.
n_classes
>
2
:
# Switch to using a multiclass objective in the underlying LGBM instance
self
.
objective
=
"multiclass"
if
eval_
s
et
is
not
None
and
eval_metric
==
"
binary_logloss
"
:
if
eval_
m
et
ric
==
'logloss'
or
eval_metric
==
'
binary_logloss
'
:
eval_metric
=
"multi_logloss"
elif
eval_metric
==
'error'
or
eval_metric
==
'binary_error'
:
eval_metric
=
"multi_error"
else
:
if
eval_metric
==
'logloss'
or
eval_metric
==
'multi_logloss'
:
eval_metric
=
'binary_logloss'
elif
eval_metric
==
'error'
or
eval_metric
==
'multi_error'
:
eval_metric
=
'binary_error'
if
eval_set
is
not
None
:
eval_set
=
[(
x
[
0
],
self
.
_le
.
transform
(
x
[
1
]))
for
x
in
eval_set
]
...
...
@@ -653,7 +668,7 @@ class LGBMRanker(LGBMModel):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
"""
Most arguments like common methods except following:
...
...
src/application/application.cpp
View file @
eade219e
...
...
@@ -12,7 +12,7 @@
#include "predictor.hpp"
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <cstdio>
#include <ctime>
...
...
@@ -226,10 +226,10 @@ void Application::Train() {
int
total_iter
=
config_
.
boosting_config
.
num_iterations
;
bool
is_finished
=
false
;
bool
need_eval
=
true
;
auto
start_time
=
std
::
chrono
::
high_resolution
_clock
::
now
();
auto
start_time
=
std
::
chrono
::
steady
_clock
::
now
();
for
(
int
iter
=
0
;
iter
<
total_iter
&&
!
is_finished
;
++
iter
)
{
is_finished
=
boosting_
->
TrainOneIter
(
nullptr
,
nullptr
,
need_eval
);
auto
end_time
=
std
::
chrono
::
high_resolution
_clock
::
now
();
auto
end_time
=
std
::
chrono
::
steady
_clock
::
now
();
// output used time per iteration
Log
::
Info
(
"%f seconds elapsed, finished iteration %d"
,
std
::
chrono
::
duration
<
double
,
std
::
milli
>
(
end_time
-
start_time
)
*
1e-3
,
iter
+
1
);
...
...
src/application/predictor.hpp
View file @
eade219e
...
...
@@ -6,7 +6,7 @@
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <cstring>
#include <cstdio>
...
...
@@ -19,7 +19,7 @@
namespace
LightGBM
{
/*!
* \brief Used to predict
ion
data with input model
* \brief Used to predict data with input model
*/
class
Predictor
{
public:
...
...
@@ -27,7 +27,7 @@ public:
* \brief Constructor
* \param boosting Input boosting model
* \param is_raw_score True if need to predict result with raw score
* \param predict_leaf_index True if output leaf index instead of prediction score
* \param
is_
predict_leaf_index True if output leaf index instead of prediction score
*/
Predictor
(
const
Boosting
*
boosting
,
bool
is_raw_score
,
bool
is_predict_leaf_index
)
{
boosting_
=
boosting
;
...
...
@@ -69,14 +69,13 @@ public:
~
Predictor
()
{
}
inline
const
PredictFunction
&
GetPredictFunction
()
{
inline
const
PredictFunction
&
GetPredictFunction
()
const
{
return
predict_fun_
;
}
/*!
* \brief predicting on data, then saving result to disk
* \param data_filename Filename of data
* \param has_label True if this data contains label
* \param result_filename Filename of output result
*/
void
Predict
(
const
char
*
data_filename
,
const
char
*
result_filename
,
bool
has_header
)
{
...
...
src/boosting/boosting.cpp
View file @
eade219e
#include <LightGBM/boosting.h>
#include "gbdt.h"
#include "dart.hpp"
#include "goss.hpp"
namespace
LightGBM
{
...
...
@@ -10,7 +11,7 @@ std::string GetBoostingTypeFromModelFile(const char* filename) {
return
type
;
}
void
Boosting
::
LoadFileToBoosting
(
Boosting
*
boosting
,
const
char
*
filename
)
{
bool
Boosting
::
LoadFileToBoosting
(
Boosting
*
boosting
,
const
char
*
filename
)
{
if
(
boosting
!=
nullptr
)
{
TextReader
<
size_t
>
model_reader
(
filename
,
true
);
model_reader
.
ReadAllLines
();
...
...
@@ -18,8 +19,11 @@ void Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
for
(
auto
&
line
:
model_reader
.
Lines
())
{
str_buf
<<
line
<<
'\n'
;
}
boosting
->
LoadModelFromString
(
str_buf
.
str
());
if
(
!
boosting
->
LoadModelFromString
(
str_buf
.
str
()))
return
false
;
}
return
true
;
}
Boosting
*
Boosting
::
CreateBoosting
(
const
std
::
string
&
type
,
const
char
*
filename
)
{
...
...
@@ -28,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return
new
GBDT
();
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
return
new
DART
();
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
return
new
GOSS
();
}
else
{
return
nullptr
;
}
...
...
@@ -39,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret
.
reset
(
new
GBDT
());
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
ret
.
reset
(
new
DART
());
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
ret
.
reset
(
new
GOSS
());
}
else
{
Log
::
Fatal
(
"unknow boosting type %s"
,
type
.
c_str
());
}
LoadFileToBoosting
(
ret
.
get
(),
filename
);
}
else
{
...
...
src/boosting/dart.hpp
View file @
eade219e
...
...
@@ -38,6 +38,11 @@ public:
random_for_drop_
=
Random
(
gbdt_config_
->
drop_seed
);
sum_weight_
=
0.0
f
;
}
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
ResetTrainingData
(
config
,
train_data
,
object_function
,
training_metrics
);
}
/*!
* \brief one training iteration
*/
...
...
@@ -78,7 +83,7 @@ private:
*/
void
DroppingTrees
()
{
drop_index_
.
clear
();
bool
is_skip
=
random_for_drop_
.
Next
Double
()
<
gbdt_config_
->
skip_drop
;
bool
is_skip
=
random_for_drop_
.
Next
Float
()
<
gbdt_config_
->
skip_drop
;
// select dropping tree indexes based on drop_rate and tree weights
if
(
!
is_skip
)
{
double
drop_rate
=
gbdt_config_
->
drop_rate
;
...
...
@@ -88,7 +93,7 @@ private:
drop_rate
=
std
::
min
(
drop_rate
,
gbdt_config_
->
max_drop
*
inv_average_weight
/
sum_weight_
);
}
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
if
(
random_for_drop_
.
Next
Double
()
<
drop_rate
*
tree_weight_
[
i
]
*
inv_average_weight
)
{
if
(
random_for_drop_
.
Next
Float
()
<
drop_rate
*
tree_weight_
[
i
]
*
inv_average_weight
)
{
drop_index_
.
push_back
(
i
);
}
}
...
...
@@ -97,7 +102,7 @@ private:
drop_rate
=
std
::
min
(
drop_rate
,
gbdt_config_
->
max_drop
/
static_cast
<
double
>
(
iter_
));
}
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
if
(
random_for_drop_
.
Next
Double
()
<
drop_rate
)
{
if
(
random_for_drop_
.
Next
Float
()
<
drop_rate
)
{
drop_index_
.
push_back
(
i
);
}
}
...
...
src/boosting/gbdt.cpp
View file @
eade219e
#include "gbdt.h"
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
...
...
@@ -18,6 +17,17 @@
namespace
LightGBM
{
#ifdef TIMETAG
std
::
chrono
::
duration
<
double
,
std
::
milli
>
boosting_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
train_score_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
out_of_bag_score_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
valid_score_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
metric_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
bagging_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
sub_gradient_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
tree_time
;
#endif // TIMETAG
GBDT
::
GBDT
()
:
iter_
(
0
),
train_data_
(
nullptr
),
...
...
@@ -25,7 +35,7 @@ GBDT::GBDT()
early_stopping_round_
(
0
),
max_feature_idx_
(
0
),
num_class_
(
1
),
sigmoid_
(
1.0
f
),
sigmoid_
(
-
1.0
f
),
num_iteration_for_pred_
(
0
),
shrinkage_rate_
(
0.1
f
),
num_init_iteration_
(
0
)
{
...
...
@@ -37,7 +47,16 @@ GBDT::GBDT()
}
GBDT
::~
GBDT
()
{
#ifdef TIMETAG
Log
::
Info
(
"GBDT::boosting costs %f"
,
boosting_time
*
1e-3
);
Log
::
Info
(
"GBDT::train_score costs %f"
,
train_score_time
*
1e-3
);
Log
::
Info
(
"GBDT::out_of_bag_score costs %f"
,
out_of_bag_score_time
*
1e-3
);
Log
::
Info
(
"GBDT::valid_score costs %f"
,
valid_score_time
*
1e-3
);
Log
::
Info
(
"GBDT::metric costs %f"
,
metric_time
*
1e-3
);
Log
::
Info
(
"GBDT::bagging costs %f"
,
bagging_time
*
1e-3
);
Log
::
Info
(
"GBDT::sub_gradient costs %f"
,
sub_gradient_time
*
1e-3
);
Log
::
Info
(
"GBDT::tree costs %f"
,
tree_time
*
1e-3
);
#endif
}
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
...
...
@@ -46,9 +65,6 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
num_iteration_for_pred_
=
0
;
max_feature_idx_
=
0
;
num_class_
=
config
->
num_class
;
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
random_
.
emplace_back
(
config
->
bagging_seed
+
i
);
}
train_data_
=
nullptr
;
gbdt_config_
=
nullptr
;
tree_learner_
=
nullptr
;
...
...
@@ -107,6 +123,10 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
max_feature_idx_
=
train_data
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data
->
label_idx
();
// get feature names
feature_names_
=
train_data
->
feature_names
();
feature_infos_
=
train_data
->
feature_infos
();
}
if
((
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
...
...
@@ -122,16 +142,26 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
right_cnts_buf_
.
resize
(
num_threads_
);
left_write_pos_buf_
.
resize
(
num_threads_
);
right_write_pos_buf_
.
resize
(
num_threads_
);
double
average_bag_rate
=
new_config
->
bagging_fraction
/
new_config
->
bagging_freq
;
is_use_subset_
=
false
;
if
(
average_bag_rate
<=
0.5
)
{
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt_
));
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data
);
is_use_subset_
=
true
;
Log
::
Debug
(
"use subset for bagging"
);
}
}
else
{
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
tmp_indices_
.
clear
();
is_use_subset_
=
false
;
}
}
train_data_
=
train_data
;
if
(
train_data_
!=
nullptr
)
{
// reset config for tree learner
tree_learner_
->
ResetConfig
(
&
new_config
->
tree_config
);
is_class_end_
=
std
::
vector
<
bool
>
(
num_class_
,
false
);
}
gbdt_config_
.
reset
(
new_config
.
release
());
}
...
...
@@ -168,34 +198,39 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
valid_metrics_
.
back
().
shrink_to_fit
();
}
data_size_t
GBDT
::
BaggingHelper
(
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
){
const
int
tid
=
omp_get_thread_num
();
data_size_t
GBDT
::
BaggingHelper
(
Random
&
cur_rand
,
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
){
if
(
cnt
<=
0
)
{
return
0
;
}
data_size_t
bag_data_cnt
=
static_cast
<
data_size_t
>
(
gbdt_config_
->
bagging_fraction
*
cnt
);
data_size_t
cur_left_cnt
=
0
;
data_size_t
cur_right_cnt
=
0
;
auto
right_buffer
=
buffer
+
bag_data_cnt
;
// random bagging, minimal unit is one record
for
(
data_size_t
i
=
0
;
i
<
cnt
;
++
i
)
{
double
prob
=
(
bag_data_cnt
-
cur_left_cnt
)
/
static_cast
<
double
>
(
cnt
-
i
);
if
(
random_
[
tid
].
NextDouble
()
<
prob
)
{
float
prob
=
(
bag_data_cnt
-
cur_left_cnt
)
/
static_cast
<
float
>
(
cnt
-
i
);
if
(
cur_rand
.
NextFloat
()
<
prob
)
{
buffer
[
cur_left_cnt
++
]
=
start
+
i
;
}
else
{
buffer
[
bag_data_cnt
+
cur_right_cnt
++
]
=
start
+
i
;
right_
buffer
[
cur_right_cnt
++
]
=
start
+
i
;
}
}
CHECK
(
cur_left_cnt
==
bag_data_cnt
);
return
cur_left_cnt
;
}
void
GBDT
::
Bagging
(
int
iter
)
{
// if need bagging
if
(
bag_data_cnt_
<
num_data_
&&
iter
%
gbdt_config_
->
bagging_freq
==
0
)
{
const
data_size_t
min_inner_size
=
1000
0
;
const
data_size_t
min_inner_size
=
1000
;
data_size_t
inner_size
=
(
num_data_
+
num_threads_
-
1
)
/
num_threads_
;
if
(
inner_size
<
min_inner_size
)
{
inner_size
=
min_inner_size
;
}
#pragma omp parallel for schedule(static,
1)
#pragma omp parallel for schedule(static,1)
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
left_cnts_buf_
[
i
]
=
0
;
right_cnts_buf_
[
i
]
=
0
;
...
...
@@ -203,7 +238,8 @@ void GBDT::Bagging(int iter) {
if
(
cur_start
>
num_data_
)
{
continue
;
}
data_size_t
cur_cnt
=
inner_size
;
if
(
cur_start
+
cur_cnt
>
num_data_
)
{
cur_cnt
=
num_data_
-
cur_start
;
}
data_size_t
cur_left_count
=
BaggingHelper
(
cur_start
,
cur_cnt
,
tmp_indices_
.
data
()
+
cur_start
);
Random
cur_rand
(
gbdt_config_
->
bagging_seed
+
iter
*
num_threads_
+
i
);
data_size_t
cur_left_count
=
BaggingHelper
(
cur_rand
,
cur_start
,
cur_cnt
,
tmp_indices_
.
data
()
+
cur_start
);
offsets_buf_
[
i
]
=
cur_start
;
left_cnts_buf_
[
i
]
=
cur_left_count
;
right_cnts_buf_
[
i
]
=
cur_cnt
-
cur_left_count
;
...
...
@@ -228,47 +264,114 @@ void GBDT::Bagging(int iter) {
tmp_indices_
.
data
()
+
offsets_buf_
[
i
]
+
left_cnts_buf_
[
i
],
right_cnts_buf_
[
i
]
*
sizeof
(
data_size_t
));
}
}
bag_data_cnt_
=
left_cnt
;
CHECK
(
bag_data_indices_
[
bag_data_cnt_
-
1
]
>
bag_data_indices_
[
bag_data_cnt_
]);
Log
::
Debug
(
"Re-bagging, using %d data to train"
,
bag_data_cnt_
);
// set bagging data to tree learner
if
(
!
is_use_subset_
)
{
tree_learner_
->
SetBaggingData
(
bag_data_indices_
.
data
(),
bag_data_cnt_
);
}
else
{
// get subset
tmp_subset_
->
ReSize
(
bag_data_cnt_
);
tmp_subset_
->
CopySubset
(
train_data_
,
bag_data_indices_
.
data
(),
bag_data_cnt_
,
false
);
tree_learner_
->
ResetTrainingData
(
tmp_subset_
.
get
());
}
}
}
void
GBDT
::
UpdateScoreOutOfBag
(
const
Tree
*
tree
,
const
int
curr_class
)
{
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
// we need to predict out-of-bag socres of data for boosting
if
(
num_data_
-
bag_data_cnt_
>
0
)
{
if
(
num_data_
-
bag_data_cnt_
>
0
&&
!
is_use_subset_
)
{
train_score_updater_
->
AddScore
(
tree
,
bag_data_indices_
.
data
()
+
bag_data_cnt_
,
num_data_
-
bag_data_cnt_
,
curr_class
);
}
#ifdef TIMETAG
out_of_bag_score_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
}
bool
GBDT
::
TrainOneIter
(
const
score_t
*
gradient
,
const
score_t
*
hessian
,
bool
is_eval
)
{
// boosting first
if
(
gradient
==
nullptr
||
hessian
==
nullptr
)
{
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
Boosting
();
gradient
=
gradients_
.
data
();
hessian
=
hessians_
.
data
();
#ifdef TIMETAG
boosting_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
}
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
// bagging logic
Bagging
(
iter_
);
#ifdef TIMETAG
bagging_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
if
(
is_use_subset_
&&
bag_data_cnt_
<
num_data_
)
{
#ifdef TIMETAG
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
if
(
gradients_
.
empty
())
{
size_t
total_size
=
static_cast
<
size_t
>
(
num_data_
)
*
num_class_
;
gradients_
.
resize
(
total_size
);
hessians_
.
resize
(
total_size
);
}
// get sub gradients
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
bias
=
curr_class
*
num_data_
;
// cannot multi-threding
for
(
int
i
=
0
;
i
<
bag_data_cnt_
;
++
i
)
{
gradients_
[
bias
+
i
]
=
gradient
[
bias
+
bag_data_indices_
[
i
]];
hessians_
[
bias
+
i
]
=
hessian
[
bias
+
bag_data_indices_
[
i
]];
}
}
gradient
=
gradients_
.
data
();
hessian
=
hessians_
.
data
();
#ifdef TIMETAG
sub_gradient_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
}
bool
should_continue
=
false
;
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
#ifdef TIMETAG
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
std
::
unique_ptr
<
Tree
>
new_tree
(
new
Tree
(
2
));
if
(
!
is_class_end_
[
curr_class
])
{
// train a new tree
std
::
unique_ptr
<
Tree
>
new_tree
(
tree_learner_
->
Train
(
gradient
+
curr_class
*
num_data_
,
hessian
+
curr_class
*
num_data_
));
// if cannot learn a new tree, then stop
if
(
new_tree
->
num_leaves
()
<=
1
)
{
Log
::
Info
(
"Stopped training because there are no more leafs that meet the split requirements."
);
return
true
;
new_tree
.
reset
(
tree_learner_
->
Train
(
gradient
+
curr_class
*
num_data_
,
hessian
+
curr_class
*
num_data_
));
}
#ifdef TIMETAG
tree_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
if
(
new_tree
->
num_leaves
()
>
1
)
{
should_continue
=
true
;
// shrinkage by learning rate
new_tree
->
Shrinkage
(
shrinkage_rate_
);
// update score
UpdateScore
(
new_tree
.
get
(),
curr_class
);
UpdateScoreOutOfBag
(
new_tree
.
get
(),
curr_class
);
}
else
{
is_class_end_
[
curr_class
]
=
true
;
}
// add model
models_
.
push_back
(
std
::
move
(
new_tree
));
}
if
(
!
should_continue
)
{
Log
::
Warning
(
"Stopped training because there are no more leaves that meet the split requirements."
);
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
models_
.
pop_back
();
}
return
true
;
}
++
iter_
;
if
(
is_eval
)
{
return
EvalAndCheckEarlyStopping
();
...
...
@@ -294,13 +397,20 @@ void GBDT::RollbackOneIter() {
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
models_
.
pop_back
();
}
is_class_end_
=
std
::
vector
<
bool
>
(
num_class_
,
false
);
--
iter_
;
}
bool
GBDT
::
EvalAndCheckEarlyStopping
()
{
bool
is_met_early_stopping
=
false
;
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
// print message for metric
auto
best_msg
=
OutputMetric
(
iter_
);
#ifdef TIMETAG
metric_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
is_met_early_stopping
=
!
best_msg
.
empty
();
if
(
is_met_early_stopping
)
{
Log
::
Info
(
"Early stopping at iteration %d, the best iteration round is %d"
,
...
...
@@ -315,12 +425,28 @@ bool GBDT::EvalAndCheckEarlyStopping() {
}
void
GBDT
::
UpdateScore
(
const
Tree
*
tree
,
const
int
curr_class
)
{
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
// update training score
if
(
!
is_use_subset_
)
{
train_score_updater_
->
AddScore
(
tree_learner_
.
get
(),
curr_class
);
}
else
{
train_score_updater_
->
AddScore
(
tree
,
curr_class
);
}
#ifdef TIMETAG
train_score_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
#ifdef TIMETAG
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
// update validation score
for
(
auto
&
score_updater
:
valid_score_updater_
)
{
score_updater
->
AddScore
(
tree
,
curr_class
);
}
#ifdef TIMETAG
valid_score_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
}
std
::
string
GBDT
::
OutputMetric
(
int
iter
)
{
...
...
@@ -441,7 +567,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
}
else
if
(
sigmoid_
>
0.0
f
){
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
out_result
[
i
]
=
static_cast
<
double
>
(
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
2.0
f
*
sigmoid_
*
raw_scores
[
i
])));
out_result
[
i
]
=
static_cast
<
double
>
(
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
sigmoid_
*
raw_scores
[
i
])));
}
}
else
{
#pragma omp parallel for schedule(static)
...
...
@@ -472,14 +598,8 @@ std::string GBDT::DumpModel(int num_iteration) const {
str_buf
<<
"
\"
max_feature_idx
\"
:"
<<
max_feature_idx_
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
sigmoid
\"
:"
<<
sigmoid_
<<
","
<<
std
::
endl
;
// output feature names
auto
feature_names
=
std
::
ref
(
feature_names_
);
if
(
train_data_
!=
nullptr
)
{
feature_names
=
std
::
ref
(
train_data_
->
feature_names
());
}
str_buf
<<
"
\"
feature_names
\"
:[
\"
"
<<
Common
::
Join
(
feature_names
.
get
()
,
"
\"
,
\"
"
)
<<
"
\"
],"
<<
Common
::
Join
(
feature_names
_
,
"
\"
,
\"
"
)
<<
"
\"
],"
<<
std
::
endl
;
str_buf
<<
"
\"
tree_info
\"
:["
;
...
...
@@ -503,51 +623,61 @@ std::string GBDT::DumpModel(int num_iteration) const {
return
str_buf
.
str
();
}
void
GBDT
::
SaveModelToFile
(
int
num_iteration
,
const
char
*
filename
)
const
{
/*! \brief File to write models */
std
::
ofstream
output_file
;
output_file
.
open
(
filename
);
std
::
string
GBDT
::
SaveModelToString
(
int
num_iterations
)
const
{
std
::
stringstream
ss
;
// output model type
output_file
<<
SubModelName
()
<<
std
::
endl
;
ss
<<
SubModelName
()
<<
std
::
endl
;
// output number of class
output_file
<<
"num_class="
<<
num_class_
<<
std
::
endl
;
ss
<<
"num_class="
<<
num_class_
<<
std
::
endl
;
// output label index
output_file
<<
"label_index="
<<
label_idx_
<<
std
::
endl
;
ss
<<
"label_index="
<<
label_idx_
<<
std
::
endl
;
// output max_feature_idx
output_file
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
ss
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
// output objective name
if
(
object_function_
!=
nullptr
)
{
output_file
<<
"objective="
<<
object_function_
->
GetName
()
<<
std
::
endl
;
ss
<<
"objective="
<<
object_function_
->
GetName
()
<<
std
::
endl
;
}
// output sigmoid parameter
output_file
<<
"sigmoid="
<<
sigmoid_
<<
std
::
endl
;
// output feature names
auto
feature_names
=
std
::
ref
(
feature_names_
);
if
(
train_data_
!=
nullptr
)
{
feature_names
=
std
::
ref
(
train_data_
->
feature_names
());
}
output_file
<<
"feature_names="
<<
Common
::
Join
(
feature_names
.
get
(),
" "
)
<<
std
::
endl
;
ss
<<
"sigmoid="
<<
sigmoid_
<<
std
::
endl
;
ss
<<
"feature_names="
<<
Common
::
Join
(
feature_names_
,
" "
)
<<
std
::
endl
;
output_file
<<
std
::
endl
;
ss
<<
"feature_infos="
<<
Common
::
Join
(
feature_infos_
,
" "
)
<<
std
::
endl
;
ss
<<
std
::
endl
;
int
num_used_model
=
static_cast
<
int
>
(
models_
.
size
());
if
(
num_iteration
>
0
)
{
num_used_model
=
std
::
min
(
num_iteration
*
num_class_
,
num_used_model
);
if
(
num_iteration
s
>
0
)
{
num_used_model
=
std
::
min
(
num_iteration
s
*
num_class_
,
num_used_model
);
}
// output tree models
for
(
int
i
=
0
;
i
<
num_used_model
;
++
i
)
{
output_file
<<
"Tree="
<<
i
<<
std
::
endl
;
output_file
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
ss
<<
"Tree="
<<
i
<<
std
::
endl
;
ss
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
}
std
::
vector
<
std
::
pair
<
size_t
,
std
::
string
>>
pairs
=
FeatureImportance
();
output_file
<<
std
::
endl
<<
"feature importances:"
<<
std
::
endl
;
ss
<<
std
::
endl
<<
"feature importances:"
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
pairs
.
size
();
++
i
)
{
output_file
<<
pairs
[
i
].
second
<<
"="
<<
std
::
to_string
(
pairs
[
i
].
first
)
<<
std
::
endl
;
ss
<<
pairs
[
i
].
second
<<
"="
<<
std
::
to_string
(
pairs
[
i
].
first
)
<<
std
::
endl
;
}
return
ss
.
str
();
}
bool
GBDT
::
SaveModelToFile
(
int
num_iteration
,
const
char
*
filename
)
const
{
/*! \brief File to write models */
std
::
ofstream
output_file
;
output_file
.
open
(
filename
);
output_file
<<
SaveModelToString
(
num_iteration
);
output_file
.
close
();
return
(
bool
)
output_file
;
}
void
GBDT
::
LoadModelFromString
(
const
std
::
string
&
model_str
)
{
bool
GBDT
::
LoadModelFromString
(
const
std
::
string
&
model_str
)
{
// use serialized string to restore this object
models_
.
clear
();
std
::
vector
<
std
::
string
>
lines
=
Common
::
Split
(
model_str
.
c_str
(),
'\n'
);
...
...
@@ -558,7 +688,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common
::
Atoi
(
Common
::
Split
(
line
.
c_str
(),
'='
)[
1
].
c_str
(),
&
num_class_
);
}
else
{
Log
::
Fatal
(
"Model file doesn't specify the number of classes"
);
return
;
return
false
;
}
// get index of label
line
=
Common
::
FindFromLines
(
lines
,
"label_index="
);
...
...
@@ -566,7 +696,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common
::
Atoi
(
Common
::
Split
(
line
.
c_str
(),
'='
)[
1
].
c_str
(),
&
label_idx_
);
}
else
{
Log
::
Fatal
(
"Model file doesn't specify the label index"
);
return
;
return
false
;
}
// get max_feature_idx first
line
=
Common
::
FindFromLines
(
lines
,
"max_feature_idx="
);
...
...
@@ -574,7 +704,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
Common
::
Atoi
(
Common
::
Split
(
line
.
c_str
(),
'='
)[
1
].
c_str
(),
&
max_feature_idx_
);
}
else
{
Log
::
Fatal
(
"Model file doesn't specify max_feature_idx"
);
return
;
return
false
;
}
// get sigmoid parameter
line
=
Common
::
FindFromLines
(
lines
,
"sigmoid="
);
...
...
@@ -589,11 +719,24 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
feature_names_
=
Common
::
Split
(
line
.
substr
(
std
::
strlen
(
"feature_names="
)).
c_str
(),
" "
);
if
(
feature_names_
.
size
()
!=
static_cast
<
size_t
>
(
max_feature_idx_
+
1
))
{
Log
::
Fatal
(
"Wrong size of feature_names"
);
return
;
return
false
;
}
}
else
{
}
else
{
Log
::
Fatal
(
"Model file doesn't contain feature names"
);
return
;
return
false
;
}
line
=
Common
::
FindFromLines
(
lines
,
"feature_infos="
);
if
(
line
.
size
()
>
0
)
{
feature_infos_
=
Common
::
Split
(
line
.
substr
(
std
::
strlen
(
"feature_infos="
)).
c_str
(),
" "
);
if
(
feature_infos_
.
size
()
!=
static_cast
<
size_t
>
(
max_feature_idx_
+
1
))
{
Log
::
Fatal
(
"Wrong size of feature_infos"
);
return
false
;
}
}
else
{
Log
::
Fatal
(
"Model file doesn't contain feature infos"
);
return
false
;
}
// get tree models
...
...
@@ -616,24 +759,23 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
num_init_iteration_
=
num_iteration_for_pred_
;
iter_
=
0
;
return
true
;
}
std
::
vector
<
std
::
pair
<
size_t
,
std
::
string
>>
GBDT
::
FeatureImportance
()
const
{
auto
feature_names
=
std
::
ref
(
feature_names_
);
if
(
train_data_
!=
nullptr
)
{
feature_names
=
std
::
ref
(
train_data_
->
feature_names
());
}
std
::
vector
<
size_t
>
feature_importances
(
max_feature_idx_
+
1
,
0
);
for
(
size_t
iter
=
0
;
iter
<
models_
.
size
();
++
iter
)
{
for
(
int
split_idx
=
0
;
split_idx
<
models_
[
iter
]
->
num_leaves
()
-
1
;
++
split_idx
)
{
++
feature_importances
[
models_
[
iter
]
->
split_feature
_real
(
split_idx
)];
++
feature_importances
[
models_
[
iter
]
->
split_feature
(
split_idx
)];
}
}
// store the importance first
std
::
vector
<
std
::
pair
<
size_t
,
std
::
string
>>
pairs
;
for
(
size_t
i
=
0
;
i
<
feature_importances
.
size
();
++
i
)
{
if
(
feature_importances
[
i
]
>
0
)
{
pairs
.
emplace_back
(
feature_importances
[
i
],
feature_names
.
get
().
at
(
i
)
);
pairs
.
emplace_back
(
feature_importances
[
i
],
feature_names
_
[
i
]
);
}
}
// sort the importance
...
...
@@ -664,7 +806,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
}
// if need sigmoid transform
if
(
sigmoid_
>
0
&&
num_class_
==
1
)
{
ret
[
0
]
=
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
2.0
f
*
sigmoid_
*
ret
[
0
]));
ret
[
0
]
=
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
sigmoid_
*
ret
[
0
]));
}
else
if
(
num_class_
>
1
)
{
Common
::
Softmax
(
&
ret
);
}
...
...
src/boosting/gbdt.h
View file @
eade219e
...
...
@@ -119,7 +119,7 @@ public:
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len leng
h
t of returned score
* \param out_len lengt
h
of returned score
*/
void
GetPredictAt
(
int
data_idx
,
double
*
out_result
,
int64_t
*
out_len
)
override
;
...
...
@@ -156,12 +156,19 @@ public:
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/
virtual
void
SaveModelToFile
(
int
num_iterations
,
const
char
*
filename
)
const
override
;
virtual
bool
SaveModelToFile
(
int
num_iterations
,
const
char
*
filename
)
const
override
;
/*!
* \brief Save model to string
* \param num_used_model Number of model that want to save, -1 means save all
* \return Non-empty string if succeeded
*/
virtual
std
::
string
SaveModelToString
(
int
num_iterations
)
const
override
;
/*!
* \brief Restore from a serialized string
*/
void
LoadModelFromString
(
const
std
::
string
&
model_str
)
override
;
bool
LoadModelFromString
(
const
std
::
string
&
model_str
)
override
;
/*!
* \brief Get max feature index of this model
...
...
@@ -169,6 +176,12 @@ public:
*/
inline
int
MaxFeatureIdx
()
const
override
{
return
max_feature_idx_
;
}
/*!
* \brief Get feature names of this model
* \return Feature names of this model
*/
inline
std
::
vector
<
std
::
string
>
FeatureNames
()
const
override
{
return
feature_names_
;
}
/*!
* \brief Get index of label column
* \return index of label column
...
...
@@ -228,7 +241,7 @@ protected:
* \param buffer output buffer
* \return count of left size
*/
virtual
data_size_t
BaggingHelper
(
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
);
data_size_t
BaggingHelper
(
Random
&
cur_rand
,
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
);
/*!
* \brief updating score for out-of-bag data.
* Data should be update since we may re-bagging data on training
...
...
@@ -301,8 +314,6 @@ protected:
data_size_t
num_data_
;
/*! \brief Number of classes */
int
num_class_
;
/*! \brief Random generator, used for bagging */
std
::
vector
<
Random
>
random_
;
/*!
* \brief Sigmoid parameter, used for prediction.
* if > 0 means output score will transform by sigmoid function
...
...
@@ -318,6 +329,7 @@ protected:
int
num_init_iteration_
;
/*! \brief Feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_infos_
;
/*! \brief number of threads */
int
num_threads_
;
/*! \brief Buffer for multi-threading bagging */
...
...
@@ -330,6 +342,9 @@ protected:
std
::
vector
<
data_size_t
>
left_write_pos_buf_
;
/*! \brief Buffer for multi-threading bagging */
std
::
vector
<
data_size_t
>
right_write_pos_buf_
;
std
::
unique_ptr
<
Dataset
>
tmp_subset_
;
bool
is_use_subset_
;
std
::
vector
<
bool
>
is_class_end_
;
};
}
// namespace LightGBM
...
...
src/boosting/goss.hpp
0 → 100644
View file @
eade219e
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace
LightGBM
{
#ifdef TIMETAG
std
::
chrono
::
duration
<
double
,
std
::
milli
>
subset_time
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
re_init_tree_time
;
#endif
class
GOSS
:
public
GBDT
{
public:
/*!
* \brief Constructor
*/
GOSS
()
:
GBDT
()
{
}
~
GOSS
()
{
#ifdef TIMETAG
Log
::
Info
(
"GOSS::subset costs %f"
,
subset_time
*
1e-3
);
Log
::
Info
(
"GOSS::re_init_tree costs %f"
,
re_init_tree_time
*
1e-3
);
#endif
}
void
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
Init
(
config
,
train_data
,
object_function
,
training_metrics
);
CHECK
(
gbdt_config_
->
top_rate
+
gbdt_config_
->
other_rate
<=
1.0
f
);
CHECK
(
gbdt_config_
->
top_rate
>
0.0
f
&&
gbdt_config_
->
other_rate
>
0.0
f
);
if
(
gbdt_config_
->
bagging_freq
>
0
&&
gbdt_config_
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot use bagging in GOSS"
);
}
Log
::
Info
(
"using GOSS"
);
}
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
if
(
config
->
bagging_freq
>
0
&&
config
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot use bagging in GOSS"
);
}
GBDT
::
ResetTrainingData
(
config
,
train_data
,
object_function
,
training_metrics
);
if
(
train_data_
==
nullptr
)
{
return
;
}
bag_data_indices_
.
resize
(
num_data_
);
tmp_indices_
.
resize
(
num_data_
);
tmp_indice_right_
.
resize
(
num_data_
);
offsets_buf_
.
resize
(
num_threads_
);
left_cnts_buf_
.
resize
(
num_threads_
);
right_cnts_buf_
.
resize
(
num_threads_
);
left_write_pos_buf_
.
resize
(
num_threads_
);
right_write_pos_buf_
.
resize
(
num_threads_
);
is_use_subset_
=
false
;
if
(
config
->
top_rate
+
config
->
other_rate
<=
0.5
)
{
auto
bag_data_cnt
=
static_cast
<
data_size_t
>
((
config
->
top_rate
+
config
->
other_rate
)
*
num_data_
);
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt
));
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data_
);
is_use_subset_
=
true
;
}
// flag to not bagging first
bag_data_cnt_
=
num_data_
;
}
data_size_t
BaggingHelper
(
Random
&
cur_rand
,
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
,
data_size_t
*
buffer_right
)
{
std
::
vector
<
score_t
>
tmp_gradients
(
cnt
,
0.0
f
);
for
(
data_size_t
i
=
0
;
i
<
cnt
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
int
idx
=
curr_class
*
num_data_
+
start
+
i
;
tmp_gradients
[
i
]
+=
std
::
fabs
(
gradients_
[
idx
]
*
hessians_
[
idx
]);
}
}
data_size_t
top_k
=
static_cast
<
data_size_t
>
(
cnt
*
gbdt_config_
->
top_rate
);
data_size_t
other_k
=
static_cast
<
data_size_t
>
(
cnt
*
gbdt_config_
->
other_rate
);
top_k
=
std
::
max
(
1
,
top_k
);
ArrayArgs
<
score_t
>::
ArgMaxAtK
(
&
tmp_gradients
,
0
,
static_cast
<
int
>
(
tmp_gradients
.
size
()),
top_k
);
score_t
threshold
=
tmp_gradients
[
top_k
-
1
];
score_t
multiply
=
static_cast
<
score_t
>
(
cnt
-
top_k
)
/
other_k
;
data_size_t
cur_left_cnt
=
0
;
data_size_t
cur_right_cnt
=
0
;
data_size_t
big_weight_cnt
=
0
;
for
(
data_size_t
i
=
0
;
i
<
cnt
;
++
i
)
{
score_t
grad
=
0.0
f
;
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
int
idx
=
curr_class
*
num_data_
+
start
+
i
;
grad
+=
std
::
fabs
(
gradients_
[
idx
]
*
hessians_
[
idx
]);
}
if
(
grad
>=
threshold
)
{
buffer
[
cur_left_cnt
++
]
=
start
+
i
;
++
big_weight_cnt
;
}
else
{
data_size_t
sampled
=
cur_left_cnt
-
big_weight_cnt
;
data_size_t
rest_need
=
other_k
-
sampled
;
data_size_t
rest_all
=
(
cnt
-
i
)
-
(
top_k
-
big_weight_cnt
);
double
prob
=
(
rest_need
)
/
static_cast
<
double
>
(
rest_all
);
if
(
cur_rand
.
NextFloat
()
<
prob
)
{
buffer
[
cur_left_cnt
++
]
=
start
+
i
;
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
int
idx
=
curr_class
*
num_data_
+
start
+
i
;
gradients_
[
idx
]
*=
multiply
;
hessians_
[
idx
]
*=
multiply
;
}
}
else
{
buffer_right
[
cur_right_cnt
++
]
=
start
+
i
;
}
}
}
return
cur_left_cnt
;
}
void
Bagging
(
int
iter
)
override
{
bag_data_cnt_
=
num_data_
;
// not subsample for first iterations
if
(
iter
<
static_cast
<
int
>
(
1.0
f
/
gbdt_config_
->
learning_rate
))
{
return
;
}
const
data_size_t
min_inner_size
=
100
;
data_size_t
inner_size
=
(
num_data_
+
num_threads_
-
1
)
/
num_threads_
;
if
(
inner_size
<
min_inner_size
)
{
inner_size
=
min_inner_size
;
}
#pragma omp parallel for schedule(static, 1)
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
left_cnts_buf_
[
i
]
=
0
;
right_cnts_buf_
[
i
]
=
0
;
data_size_t
cur_start
=
i
*
inner_size
;
if
(
cur_start
>
num_data_
)
{
continue
;
}
data_size_t
cur_cnt
=
inner_size
;
if
(
cur_start
+
cur_cnt
>
num_data_
)
{
cur_cnt
=
num_data_
-
cur_start
;
}
Random
cur_rand
(
gbdt_config_
->
bagging_seed
+
iter
*
num_threads_
+
i
);
data_size_t
cur_left_count
=
BaggingHelper
(
cur_rand
,
cur_start
,
cur_cnt
,
tmp_indices_
.
data
()
+
cur_start
,
tmp_indice_right_
.
data
()
+
cur_start
);
offsets_buf_
[
i
]
=
cur_start
;
left_cnts_buf_
[
i
]
=
cur_left_count
;
right_cnts_buf_
[
i
]
=
cur_cnt
-
cur_left_count
;
}
data_size_t
left_cnt
=
0
;
left_write_pos_buf_
[
0
]
=
0
;
right_write_pos_buf_
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
num_threads_
;
++
i
)
{
left_write_pos_buf_
[
i
]
=
left_write_pos_buf_
[
i
-
1
]
+
left_cnts_buf_
[
i
-
1
];
right_write_pos_buf_
[
i
]
=
right_write_pos_buf_
[
i
-
1
]
+
right_cnts_buf_
[
i
-
1
];
}
left_cnt
=
left_write_pos_buf_
[
num_threads_
-
1
]
+
left_cnts_buf_
[
num_threads_
-
1
];
#pragma omp parallel for schedule(static, 1)
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
if
(
left_cnts_buf_
[
i
]
>
0
)
{
std
::
memcpy
(
bag_data_indices_
.
data
()
+
left_write_pos_buf_
[
i
],
tmp_indices_
.
data
()
+
offsets_buf_
[
i
],
left_cnts_buf_
[
i
]
*
sizeof
(
data_size_t
));
}
if
(
right_cnts_buf_
[
i
]
>
0
)
{
std
::
memcpy
(
bag_data_indices_
.
data
()
+
left_cnt
+
right_write_pos_buf_
[
i
],
tmp_indice_right_
.
data
()
+
offsets_buf_
[
i
],
right_cnts_buf_
[
i
]
*
sizeof
(
data_size_t
));
}
}
bag_data_cnt_
=
left_cnt
;
// set bagging data to tree learner
if
(
!
is_use_subset_
)
{
tree_learner_
->
SetBaggingData
(
bag_data_indices_
.
data
(),
bag_data_cnt_
);
}
else
{
// get subset
#ifdef TIMETAG
auto
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
tmp_subset_
->
ReSize
(
bag_data_cnt_
);
tmp_subset_
->
CopySubset
(
train_data_
,
bag_data_indices_
.
data
(),
bag_data_cnt_
,
false
);
#ifdef TIMETAG
subset_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
#ifdef TIMETAG
start_time
=
std
::
chrono
::
steady_clock
::
now
();
#endif
tree_learner_
->
ResetTrainingData
(
tmp_subset_
.
get
());
#ifdef TIMETAG
re_init_tree_time
+=
std
::
chrono
::
steady_clock
::
now
()
-
start_time
;
#endif
}
}
/*!
* \brief Get Type name of this boosting object
*/
const
char
*
SubModelName
()
const
override
{
return
"tree"
;
}
private:
std
::
vector
<
data_size_t
>
tmp_indice_right_
;
};
}
// namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
src/boosting/score_updater.hpp
View file @
eade219e
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
...
...
src/c_api.cpp
View file @
eade219e
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
...
...
@@ -31,6 +31,10 @@ public:
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
filename
));
}
Booster
()
{
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
"gbdt"
,
nullptr
));
}
Booster
(
const
Dataset
*
train_data
,
const
char
*
parameters
)
{
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
...
...
@@ -181,6 +185,14 @@ public:
boosting_
->
SaveModelToFile
(
num_iteration
,
filename
);
}
void
LoadModelFromString
(
const
char
*
model_str
)
{
boosting_
->
LoadModelFromString
(
model_str
);
}
std
::
string
SaveModelToString
(
int
num_iteration
)
{
return
boosting_
->
SaveModelToString
(
num_iteration
);
}
std
::
string
DumpModel
(
int
num_iteration
)
{
return
boosting_
->
DumpModel
(
num_iteration
);
}
...
...
@@ -213,6 +225,15 @@ public:
return
idx
;
}
int
GetFeatureNames
(
char
**
out_strs
)
const
{
int
idx
=
0
;
for
(
const
auto
&
name
:
boosting_
->
FeatureNames
())
{
std
::
strcpy
(
out_strs
[
idx
],
name
.
c_str
());
++
idx
;
}
return
idx
;
}
const
Boosting
*
GetBoosting
()
const
{
return
boosting_
.
get
();
}
private:
...
...
@@ -267,11 +288,11 @@ private:
// start of c_api functions
DllExport
const
char
*
LGBM_GetLastError
()
{
LIGHTGBM_C_EXPORT
const
char
*
LGBM_GetLastError
()
{
return
LastErrorMsg
();
}
DllExport
int
LGBM_DatasetCreateFromFile
(
const
char
*
filename
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromFile
(
const
char
*
filename
,
const
char
*
parameters
,
const
DatasetHandle
reference
,
DatasetHandle
*
out
)
{
...
...
@@ -289,7 +310,145 @@ DllExport int LGBM_DatasetCreateFromFile(const char* filename,
API_END
();
}
DllExport
int
LGBM_DatasetCreateFromMat
(
const
void
*
data
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromSampledMat
(
const
void
*
data
,
int
data_type
,
int32_t
num_sample_row
,
int32_t
ncol
,
int32_t
num_total_row
,
const
char
*
parameters
,
DatasetHandle
*
out
)
{
if
(
num_sample_row
==
num_total_row
)
{
return
LGBM_DatasetCreateFromMat
(
data
,
data_type
,
num_total_row
,
ncol
,
1
,
parameters
,
nullptr
,
out
);
}
else
{
API_BEGIN
();
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
auto
get_row_fun
=
RowFunctionFromDenseMatric
(
data
,
num_sample_row
,
ncol
,
data_type
,
1
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol
);
for
(
int
i
=
0
;
i
<
num_sample_row
;
++
i
)
{
auto
row
=
get_row_fun
(
i
);
for
(
size_t
idx
=
0
;
idx
<
row
.
size
();
++
idx
)
{
if
(
std
::
fabs
(
row
[
idx
])
>
kEpsilon
)
{
sample_values
[
idx
].
emplace_back
(
row
[
idx
]);
sample_idx
[
idx
].
emplace_back
(
i
);
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
*
out
=
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
num_sample_row
,
static_cast
<
data_size_t
>
(
num_total_row
));
API_END
();
}
}
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromSampledCSR
(
const
void
*
indptr
,
int
indptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
int
data_type
,
int64_t
nindptr
,
int64_t
n_sample_elem
,
int64_t
num_col
,
int64_t
num_total_row
,
const
char
*
parameters
,
DatasetHandle
*
out
)
{
if
(
nindptr
-
1
==
num_total_row
)
{
return
LGBM_DatasetCreateFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
n_sample_elem
,
num_col
,
parameters
,
nullptr
,
out
);
}
else
{
API_BEGIN
();
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
n_sample_elem
);
int32_t
num_sample_row
=
static_cast
<
int32_t
>
(
nindptr
-
1
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
num_col
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
num_col
);
for
(
int
i
=
0
;
i
<
num_sample_row
;
++
i
)
{
auto
row
=
get_row_fun
(
i
);
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
sample_values
.
resize
(
inner_data
.
first
+
1
);
sample_idx
.
resize
(
inner_data
.
first
+
1
);
}
if
(
std
::
fabs
(
inner_data
.
second
)
>
kEpsilon
)
{
sample_values
[
inner_data
.
first
].
emplace_back
(
inner_data
.
second
);
sample_idx
[
inner_data
.
first
].
emplace_back
(
i
);
}
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
*
out
=
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
num_sample_row
,
static_cast
<
data_size_t
>
(
num_total_row
));
API_END
();
}
}
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateByReference
(
const
DatasetHandle
reference
,
int64_t
num_total_row
,
DatasetHandle
*
out
)
{
API_BEGIN
();
std
::
unique_ptr
<
Dataset
>
ret
;
ret
.
reset
(
new
Dataset
(
static_cast
<
data_size_t
>
(
num_total_row
)));
ret
->
CreateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
*
out
=
ret
.
release
();
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_DatasetPushRows
(
DatasetHandle
dataset
,
const
void
*
data
,
int
data_type
,
int32_t
nrow
,
int32_t
ncol
,
int32_t
start_row
)
{
API_BEGIN
();
auto
p_dataset
=
reinterpret_cast
<
Dataset
*>
(
dataset
);
auto
get_row_fun
=
RowFunctionFromDenseMatric
(
data
,
nrow
,
ncol
,
data_type
,
1
);
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
auto
one_row
=
get_row_fun
(
i
);
p_dataset
->
PushOneRow
(
tid
,
start_row
+
i
,
one_row
);
}
if
(
start_row
+
nrow
==
p_dataset
->
num_data
())
{
p_dataset
->
FinishLoad
();
}
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_DatasetPushRowsByCSR
(
DatasetHandle
dataset
,
const
void
*
indptr
,
int
indptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
int
data_type
,
int64_t
nindptr
,
int64_t
nelem
,
int64_t
,
int64_t
start_row
)
{
API_BEGIN
();
auto
p_dataset
=
reinterpret_cast
<
Dataset
*>
(
dataset
);
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
nelem
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
nindptr
-
1
);
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
auto
one_row
=
get_row_fun
(
i
);
p_dataset
->
PushOneRow
(
tid
,
static_cast
<
data_size_t
>
(
start_row
+
i
),
one_row
);
}
if
(
start_row
+
nrow
==
static_cast
<
int64_t
>
(
p_dataset
->
num_data
()))
{
p_dataset
->
FinishLoad
();
}
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromMat
(
const
void
*
data
,
int
data_type
,
int32_t
nrow
,
int32_t
ncol
,
...
...
@@ -309,25 +468,26 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol
);
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
if
(
std
::
fabs
(
row
[
j
])
>
1e-15
)
{
sample_values
[
j
].
push_back
(
row
[
j
]);
if
(
std
::
fabs
(
row
[
j
])
>
kEpsilon
)
{
sample_values
[
j
].
emplace_back
(
row
[
j
]);
sample_idx
[
j
].
emplace_back
(
static_cast
<
int
>
(
i
));
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
ret
->
CreateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
auto
one_row
=
get_row_fun
(
i
);
...
...
@@ -338,7 +498,7 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
API_END
();
}
DllExport
int
LGBM_DatasetCreateFromCSR
(
const
void
*
indptr
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromCSR
(
const
void
*
indptr
,
int
indptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
...
...
@@ -362,34 +522,31 @@ DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
;
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
();
}
sample_values
.
resize
(
inner_data
.
first
+
1
);
sample_idx
.
resize
(
inner_data
.
first
+
1
);
}
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
// edit the feature value
sample_
values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
if
(
std
::
fabs
(
inner_data
.
second
)
>
kEpsilon
)
{
sample_values
[
inner_data
.
first
].
emplace_back
(
inner_data
.
second
);
sample_
idx
[
inner_data
.
first
].
emplace_back
(
static_cast
<
int
>
(
i
)
);
}
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
ret
->
CreateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
int
i
=
0
;
i
<
nindptr
-
1
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
auto
one_row
=
get_row_fun
(
i
);
...
...
@@ -400,7 +557,7 @@ DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
API_END
();
}
DllExport
int
LGBM_DatasetCreateFromCSC
(
const
void
*
col_ptr
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetCreateFromCSC
(
const
void
*
col_ptr
,
int
col_ptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
...
...
@@ -423,30 +580,33 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol_ptr
-
1
);
#pragma omp parallel for schedule(guided)
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol_ptr
-
1
);
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
for
(
int
j
=
0
;
j
<
sample_cnt
;
j
++
)
{
auto
val
=
col_it
.
Get
(
sample_indices
[
j
]);
if
(
std
::
fabs
(
val
)
>
kEpsilon
)
{
sample_values
[
i
].
push_back
(
val
);
sample_values
[
i
].
emplace_back
(
val
);
sample_idx
[
i
].
emplace_back
(
j
);
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
ret
->
CreateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
int
feature_idx
=
ret
->
Get
InnerFeatureIndex
(
i
);
int
feature_idx
=
ret
->
InnerFeatureIndex
(
i
);
if
(
feature_idx
<
0
)
{
continue
;
}
int
group
=
ret
->
Feature2Group
(
feature_idx
);
int
sub_feature
=
ret
->
Feture2SubFeature
(
feature_idx
);
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
int
row_idx
=
0
;
while
(
row_idx
<
nrow
)
{
...
...
@@ -454,7 +614,7 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx
=
pair
.
first
;
// no more data
if
(
row_idx
<
0
)
{
break
;
}
ret
->
FeatureAt
(
feature_idx
)
->
PushData
(
tid
,
row_idx
,
pair
.
second
);
ret
->
Push
One
Data
(
tid
,
row_idx
,
group
,
sub_feature
,
pair
.
second
);
}
}
ret
->
FinishLoad
();
...
...
@@ -462,7 +622,7 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
API_END
();
}
DllExport
int
LGBM_DatasetGetSubset
(
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetSubset
(
const
DatasetHandle
handle
,
const
int32_t
*
used_row_indices
,
int32_t
num_used_row_indices
,
...
...
@@ -473,16 +633,14 @@ DllExport int LGBM_DatasetGetSubset(
IOConfig
io_config
;
io_config
.
Set
(
param
);
auto
full_dataset
=
reinterpret_cast
<
const
Dataset
*>
(
handle
);
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
full_dataset
->
Subset
(
used_row_indices
,
num_used_row_indices
,
io_config
.
is_enable_sparse
));
ret
->
FinishLoad
();
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_used_row_indices
));
ret
->
CopyFeatureMapperFrom
(
full_dataset
);
ret
->
CopySubset
(
full_dataset
,
used_row_indices
,
num_used_row_indices
,
true
);
*
out
=
ret
.
release
();
API_END
();
}
DllExport
int
LGBM_DatasetSetFeatureNames
(
LIGHTGBM_C_EXPORT
int
LGBM_DatasetSetFeatureNames
(
DatasetHandle
handle
,
const
char
**
feature_names
,
int
num_feature_names
)
{
...
...
@@ -496,7 +654,7 @@ DllExport int LGBM_DatasetSetFeatureNames(
API_END
();
}
DllExport
int
LGBM_DatasetGetFeatureNames
(
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetFeatureNames
(
DatasetHandle
handle
,
char
**
feature_names
,
int
*
num_feature_names
)
{
...
...
@@ -510,13 +668,13 @@ DllExport int LGBM_DatasetGetFeatureNames(
API_END
();
}
DllExport
int
LGBM_DatasetFree
(
DatasetHandle
handle
)
{
LIGHTGBM_C_EXPORT
int
LGBM_DatasetFree
(
DatasetHandle
handle
)
{
API_BEGIN
();
delete
reinterpret_cast
<
Dataset
*>
(
handle
);
API_END
();
}
DllExport
int
LGBM_DatasetSaveBinary
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetSaveBinary
(
DatasetHandle
handle
,
const
char
*
filename
)
{
API_BEGIN
();
auto
dataset
=
reinterpret_cast
<
Dataset
*>
(
handle
);
...
...
@@ -524,7 +682,7 @@ DllExport int LGBM_DatasetSaveBinary(DatasetHandle handle,
API_END
();
}
DllExport
int
LGBM_DatasetSetField
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetSetField
(
DatasetHandle
handle
,
const
char
*
field_name
,
const
void
*
field_data
,
int
num_element
,
...
...
@@ -543,7 +701,7 @@ DllExport int LGBM_DatasetSetField(DatasetHandle handle,
API_END
();
}
DllExport
int
LGBM_DatasetGetField
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetField
(
DatasetHandle
handle
,
const
char
*
field_name
,
int
*
out_len
,
const
void
**
out_ptr
,
...
...
@@ -566,7 +724,7 @@ DllExport int LGBM_DatasetGetField(DatasetHandle handle,
API_END
();
}
DllExport
int
LGBM_DatasetGetNumData
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetNumData
(
DatasetHandle
handle
,
int
*
out
)
{
API_BEGIN
();
auto
dataset
=
reinterpret_cast
<
Dataset
*>
(
handle
);
...
...
@@ -574,7 +732,7 @@ DllExport int LGBM_DatasetGetNumData(DatasetHandle handle,
API_END
();
}
DllExport
int
LGBM_DatasetGetNumFeature
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetNumFeature
(
DatasetHandle
handle
,
int
*
out
)
{
API_BEGIN
();
auto
dataset
=
reinterpret_cast
<
Dataset
*>
(
handle
);
...
...
@@ -584,7 +742,7 @@ DllExport int LGBM_DatasetGetNumFeature(DatasetHandle handle,
// ---- start of booster
DllExport
int
LGBM_BoosterCreate
(
const
DatasetHandle
train_data
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterCreate
(
const
DatasetHandle
train_data
,
const
char
*
parameters
,
BoosterHandle
*
out
)
{
API_BEGIN
();
...
...
@@ -594,7 +752,7 @@ DllExport int LGBM_BoosterCreate(const DatasetHandle train_data,
API_END
();
}
DllExport
int
LGBM_BoosterCreateFromModelfile
(
LIGHTGBM_C_EXPORT
int
LGBM_BoosterCreateFromModelfile
(
const
char
*
filename
,
int
*
out_num_iterations
,
BoosterHandle
*
out
)
{
...
...
@@ -605,13 +763,25 @@ DllExport int LGBM_BoosterCreateFromModelfile(
API_END
();
}
DllExport
int
LGBM_BoosterFree
(
BoosterHandle
handle
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterLoadModelFromString
(
const
char
*
model_str
,
int
*
out_num_iterations
,
BoosterHandle
*
out
)
{
API_BEGIN
();
auto
ret
=
std
::
unique_ptr
<
Booster
>
(
new
Booster
());
ret
->
LoadModelFromString
(
model_str
);
*
out_num_iterations
=
ret
->
GetBoosting
()
->
GetCurrentIteration
();
*
out
=
ret
.
release
();
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_BoosterFree
(
BoosterHandle
handle
)
{
API_BEGIN
();
delete
reinterpret_cast
<
Booster
*>
(
handle
);
API_END
();
}
DllExport
int
LGBM_BoosterMerge
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterMerge
(
BoosterHandle
handle
,
BoosterHandle
other_handle
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
...
...
@@ -620,7 +790,7 @@ DllExport int LGBM_BoosterMerge(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterAddValidData
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterAddValidData
(
BoosterHandle
handle
,
const
DatasetHandle
valid_data
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
...
...
@@ -629,7 +799,7 @@ DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterResetTrainingData
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterResetTrainingData
(
BoosterHandle
handle
,
const
DatasetHandle
train_data
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
...
...
@@ -638,21 +808,21 @@ DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterResetParameter
(
BoosterHandle
handle
,
const
char
*
parameters
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterResetParameter
(
BoosterHandle
handle
,
const
char
*
parameters
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
ResetConfig
(
parameters
);
API_END
();
}
DllExport
int
LGBM_BoosterGetNumClasses
(
BoosterHandle
handle
,
int
*
out_len
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetNumClasses
(
BoosterHandle
handle
,
int
*
out_len
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
API_END
();
}
DllExport
int
LGBM_BoosterUpdateOneIter
(
BoosterHandle
handle
,
int
*
is_finished
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterUpdateOneIter
(
BoosterHandle
handle
,
int
*
is_finished
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
if
(
ref_booster
->
TrainOneIter
())
{
...
...
@@ -663,7 +833,7 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished)
API_END
();
}
DllExport
int
LGBM_BoosterUpdateOneIterCustom
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterUpdateOneIterCustom
(
BoosterHandle
handle
,
const
float
*
grad
,
const
float
*
hess
,
int
*
is_finished
)
{
...
...
@@ -677,35 +847,49 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterRollbackOneIter
(
BoosterHandle
handle
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterRollbackOneIter
(
BoosterHandle
handle
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
RollbackOneIter
();
API_END
();
}
DllExport
int
LGBM_BoosterGetCurrentIteration
(
BoosterHandle
handle
,
int
*
out_iteration
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetCurrentIteration
(
BoosterHandle
handle
,
int
*
out_iteration
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_iteration
=
ref_booster
->
GetBoosting
()
->
GetCurrentIteration
();
API_END
();
}
DllExport
int
LGBM_BoosterGetEvalCounts
(
BoosterHandle
handle
,
int
*
out_len
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetEvalCounts
(
BoosterHandle
handle
,
int
*
out_len
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetEvalCounts
();
API_END
();
}
DllExport
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int
*
out_len
,
char
**
out_strs
)
{
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int
*
out_len
,
char
**
out_strs
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetEvalNames
(
out_strs
);
API_END
();
}
DllExport
int
LGBM_BoosterGetEval
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetFeatureNames
(
BoosterHandle
handle
,
int
*
out_len
,
char
**
out_strs
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetFeatureNames
(
out_strs
);
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetNumFeature
(
BoosterHandle
handle
,
int
*
out_len
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetBoosting
()
->
MaxFeatureIdx
()
+
1
;
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetEval
(
BoosterHandle
handle
,
int
data_idx
,
int
*
out_len
,
double
*
out_results
)
{
...
...
@@ -720,7 +904,7 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterGetNumPredict
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetNumPredict
(
BoosterHandle
handle
,
int
data_idx
,
int64_t
*
out_len
)
{
API_BEGIN
();
...
...
@@ -729,7 +913,7 @@ DllExport int LGBM_BoosterGetNumPredict(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterGetPredict
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetPredict
(
BoosterHandle
handle
,
int
data_idx
,
int64_t
*
out_len
,
double
*
out_result
)
{
...
...
@@ -739,7 +923,7 @@ DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterPredictForFile
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterPredictForFile
(
BoosterHandle
handle
,
const
char
*
data_filename
,
int
data_has_header
,
int
predict_type
,
...
...
@@ -766,7 +950,7 @@ int64_t GetNumPredOneRow(const Booster* ref_booster, int predict_type, int64_t n
return
num_preb_in_one_row
;
}
DllExport
int
LGBM_BoosterCalcNumPredict
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterCalcNumPredict
(
BoosterHandle
handle
,
int
num_row
,
int
predict_type
,
int
num_iteration
,
...
...
@@ -777,7 +961,7 @@ DllExport int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterPredictForCSR
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterPredictForCSR
(
BoosterHandle
handle
,
const
void
*
indptr
,
int
indptr_type
,
const
int32_t
*
indices
,
...
...
@@ -796,7 +980,7 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
nelem
);
int64_t
num_preb_in_one_row
=
GetNumPredOneRow
(
ref_booster
,
predict_type
,
num_iteration
);
int
nrow
=
static_cast
<
int
>
(
nindptr
-
1
);
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
predictor
.
GetPredictFunction
()(
one_row
);
...
...
@@ -808,7 +992,7 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterPredictForCSC
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterPredictForCSC
(
BoosterHandle
handle
,
const
void
*
col_ptr
,
int
col_ptr_type
,
const
int32_t
*
indices
,
...
...
@@ -853,7 +1037,7 @@ DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterPredictForMat
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterPredictForMat
(
BoosterHandle
handle
,
const
void
*
data
,
int
data_type
,
int32_t
nrow
,
...
...
@@ -868,7 +1052,7 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
auto
predictor
=
ref_booster
->
NewPredictor
(
static_cast
<
int
>
(
num_iteration
),
predict_type
);
auto
get_row_fun
=
RowPairFunctionFromDenseMatric
(
data
,
nrow
,
ncol
,
data_type
,
is_row_major
);
int64_t
num_preb_in_one_row
=
GetNumPredOneRow
(
ref_booster
,
predict_type
,
num_iteration
);
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
predictor
.
GetPredictFunction
()(
one_row
);
...
...
@@ -880,7 +1064,7 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterSaveModel
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterSaveModel
(
BoosterHandle
handle
,
int
num_iteration
,
const
char
*
filename
)
{
API_BEGIN
();
...
...
@@ -889,7 +1073,22 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterDumpModel
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterSaveModelToString
(
BoosterHandle
handle
,
int
num_iteration
,
int
buffer_len
,
int
*
out_len
,
char
*
out_str
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
std
::
string
model
=
ref_booster
->
SaveModelToString
(
num_iteration
);
*
out_len
=
static_cast
<
int
>
(
model
.
size
())
+
1
;
if
(
*
out_len
<=
buffer_len
)
{
std
::
strcpy
(
out_str
,
model
.
c_str
());
}
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_BoosterDumpModel
(
BoosterHandle
handle
,
int
num_iteration
,
int
buffer_len
,
int
*
out_len
,
...
...
@@ -904,7 +1103,7 @@ DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterGetLeafValue
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterGetLeafValue
(
BoosterHandle
handle
,
int
tree_idx
,
int
leaf_idx
,
double
*
out_val
)
{
...
...
@@ -914,7 +1113,7 @@ DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterSetLeafValue
(
BoosterHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_BoosterSetLeafValue
(
BoosterHandle
handle
,
int
tree_idx
,
int
leaf_idx
,
double
val
)
{
...
...
@@ -924,6 +1123,54 @@ DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle,
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_AllocateArray
(
int64_t
len
,
int
type
,
ArrayHandle
*
out
)
{
API_BEGIN
();
if
(
type
==
C_API_DTYPE_FLOAT32
)
{
*
out
=
new
float
[
len
];
}
else
if
(
type
==
C_API_DTYPE_FLOAT64
)
{
*
out
=
new
double
[
len
];
}
else
if
(
type
==
C_API_DTYPE_INT32
)
{
*
out
=
new
int32_t
[
len
];
}
else
if
(
type
==
C_API_DTYPE_INT64
)
{
*
out
=
new
int64_t
[
len
];
}
API_END
();
}
template
<
typename
T
>
void
Copy
(
T
*
dst
,
const
T
*
src
,
int64_t
len
)
{
std
::
memcpy
(
dst
,
src
,
sizeof
(
T
)
*
len
);
}
LIGHTGBM_C_EXPORT
int
LGBM_CopyToArray
(
ArrayHandle
arr
,
int
type
,
int64_t
start_idx
,
const
void
*
src
,
int64_t
len
)
{
API_BEGIN
();
if
(
type
==
C_API_DTYPE_FLOAT32
)
{
Copy
<
float
>
(
static_cast
<
float
*>
(
arr
)
+
start_idx
,
static_cast
<
const
float
*>
(
src
),
len
);
}
else
if
(
type
==
C_API_DTYPE_FLOAT64
)
{
Copy
<
double
>
(
static_cast
<
double
*>
(
arr
)
+
start_idx
,
static_cast
<
const
double
*>
(
src
),
len
);
}
else
if
(
type
==
C_API_DTYPE_INT32
)
{
Copy
<
int32_t
>
(
static_cast
<
int32_t
*>
(
arr
)
+
start_idx
,
static_cast
<
const
int32_t
*>
(
src
),
len
);
}
else
if
(
type
==
C_API_DTYPE_INT64
)
{
Copy
<
int64_t
>
(
static_cast
<
int64_t
*>
(
arr
)
+
start_idx
,
static_cast
<
const
int64_t
*>
(
src
),
len
);
}
API_END
();
}
LIGHTGBM_C_EXPORT
int
LGBM_FreeArray
(
ArrayHandle
arr
,
int
type
)
{
API_BEGIN
();
if
(
type
==
C_API_DTYPE_FLOAT32
)
{
delete
[]
static_cast
<
float
*>
(
arr
);
}
else
if
(
type
==
C_API_DTYPE_FLOAT64
)
{
delete
[]
static_cast
<
double
*>
(
arr
);
}
else
if
(
type
==
C_API_DTYPE_INT32
)
{
delete
[]
static_cast
<
int32_t
*>
(
arr
);
}
else
if
(
type
==
C_API_DTYPE_INT64
)
{
delete
[]
static_cast
<
int64_t
*>
(
arr
);
}
API_END
();
}
// ---- start of some help functions
std
::
function
<
std
::
vector
<
double
>
(
int
row_idx
)
>
...
...
@@ -931,7 +1178,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
if
(
data_type
==
C_API_DTYPE_FLOAT32
)
{
const
float
*
data_ptr
=
reinterpret_cast
<
const
float
*>
(
data
);
if
(
is_row_major
)
{
return
[
data_ptr
,
num_col
,
num_row
](
int
row_idx
)
{
return
[
data_ptr
,
num_col
,
num_row
]
(
int
row_idx
)
{
std
::
vector
<
double
>
ret
(
num_col
);
auto
tmp_ptr
=
data_ptr
+
num_col
*
row_idx
;
for
(
int
i
=
0
;
i
<
num_col
;
++
i
)
{
...
...
@@ -940,7 +1187,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
return
ret
;
};
}
else
{
return
[
data_ptr
,
num_col
,
num_row
](
int
row_idx
)
{
return
[
data_ptr
,
num_col
,
num_row
]
(
int
row_idx
)
{
std
::
vector
<
double
>
ret
(
num_col
);
for
(
int
i
=
0
;
i
<
num_col
;
++
i
)
{
ret
[
i
]
=
static_cast
<
double
>
(
*
(
data_ptr
+
num_row
*
i
+
row_idx
));
...
...
@@ -951,7 +1198,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
}
else
if
(
data_type
==
C_API_DTYPE_FLOAT64
)
{
const
double
*
data_ptr
=
reinterpret_cast
<
const
double
*>
(
data
);
if
(
is_row_major
)
{
return
[
data_ptr
,
num_col
,
num_row
](
int
row_idx
)
{
return
[
data_ptr
,
num_col
,
num_row
]
(
int
row_idx
)
{
std
::
vector
<
double
>
ret
(
num_col
);
auto
tmp_ptr
=
data_ptr
+
num_col
*
row_idx
;
for
(
int
i
=
0
;
i
<
num_col
;
++
i
)
{
...
...
@@ -960,7 +1207,7 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
return
ret
;
};
}
else
{
return
[
data_ptr
,
num_col
,
num_row
](
int
row_idx
)
{
return
[
data_ptr
,
num_col
,
num_row
]
(
int
row_idx
)
{
std
::
vector
<
double
>
ret
(
num_col
);
for
(
int
i
=
0
;
i
<
num_col
;
++
i
)
{
ret
[
i
]
=
static_cast
<
double
>
(
*
(
data_ptr
+
num_row
*
i
+
row_idx
));
...
...
@@ -976,7 +1223,7 @@ std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric
(
const
void
*
data
,
int
num_row
,
int
num_col
,
int
data_type
,
int
is_row_major
)
{
auto
inner_function
=
RowFunctionFromDenseMatric
(
data
,
num_row
,
num_col
,
data_type
,
is_row_major
);
if
(
inner_function
!=
nullptr
)
{
return
[
inner_function
](
int
row_idx
)
{
return
[
inner_function
]
(
int
row_idx
)
{
auto
raw_values
=
inner_function
(
row_idx
);
std
::
vector
<
std
::
pair
<
int
,
double
>>
ret
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
raw_values
.
size
());
++
i
)
{
...
...
@@ -996,7 +1243,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const
float
*
data_ptr
=
reinterpret_cast
<
const
float
*>
(
data
);
if
(
indptr_type
==
C_API_DTYPE_INT32
)
{
const
int32_t
*
ptr_indptr
=
reinterpret_cast
<
const
int32_t
*>
(
indptr
);
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
](
int
idx
)
{
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
]
(
int
idx
)
{
std
::
vector
<
std
::
pair
<
int
,
double
>>
ret
;
int64_t
start
=
ptr_indptr
[
idx
];
int64_t
end
=
ptr_indptr
[
idx
+
1
];
...
...
@@ -1007,7 +1254,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
};
}
else
if
(
indptr_type
==
C_API_DTYPE_INT64
)
{
const
int64_t
*
ptr_indptr
=
reinterpret_cast
<
const
int64_t
*>
(
indptr
);
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
](
int
idx
)
{
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
]
(
int
idx
)
{
std
::
vector
<
std
::
pair
<
int
,
double
>>
ret
;
int64_t
start
=
ptr_indptr
[
idx
];
int64_t
end
=
ptr_indptr
[
idx
+
1
];
...
...
@@ -1021,7 +1268,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const
double
*
data_ptr
=
reinterpret_cast
<
const
double
*>
(
data
);
if
(
indptr_type
==
C_API_DTYPE_INT32
)
{
const
int32_t
*
ptr_indptr
=
reinterpret_cast
<
const
int32_t
*>
(
indptr
);
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
](
int
idx
)
{
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
]
(
int
idx
)
{
std
::
vector
<
std
::
pair
<
int
,
double
>>
ret
;
int64_t
start
=
ptr_indptr
[
idx
];
int64_t
end
=
ptr_indptr
[
idx
+
1
];
...
...
@@ -1032,7 +1279,7 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
};
}
else
if
(
indptr_type
==
C_API_DTYPE_INT64
)
{
const
int64_t
*
ptr_indptr
=
reinterpret_cast
<
const
int64_t
*>
(
indptr
);
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
](
int
idx
)
{
return
[
ptr_indptr
,
indices
,
data_ptr
,
nindptr
,
nelem
]
(
int
idx
)
{
std
::
vector
<
std
::
pair
<
int
,
double
>>
ret
;
int64_t
start
=
ptr_indptr
[
idx
];
int64_t
end
=
ptr_indptr
[
idx
+
1
];
...
...
@@ -1055,7 +1302,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const
int32_t
*
ptr_col_ptr
=
reinterpret_cast
<
const
int32_t
*>
(
col_ptr
);
int64_t
start
=
ptr_col_ptr
[
col_idx
];
int64_t
end
=
ptr_col_ptr
[
col_idx
+
1
];
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
](
int
bias
)
{
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
]
(
int
bias
)
{
int64_t
i
=
static_cast
<
int64_t
>
(
start
+
bias
);
if
(
i
>=
end
)
{
return
std
::
make_pair
(
-
1
,
0.0
);
...
...
@@ -1068,7 +1315,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const
int64_t
*
ptr_col_ptr
=
reinterpret_cast
<
const
int64_t
*>
(
col_ptr
);
int64_t
start
=
ptr_col_ptr
[
col_idx
];
int64_t
end
=
ptr_col_ptr
[
col_idx
+
1
];
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
](
int
bias
)
{
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
]
(
int
bias
)
{
int64_t
i
=
static_cast
<
int64_t
>
(
start
+
bias
);
if
(
i
>=
end
)
{
return
std
::
make_pair
(
-
1
,
0.0
);
...
...
@@ -1084,7 +1331,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const
int32_t
*
ptr_col_ptr
=
reinterpret_cast
<
const
int32_t
*>
(
col_ptr
);
int64_t
start
=
ptr_col_ptr
[
col_idx
];
int64_t
end
=
ptr_col_ptr
[
col_idx
+
1
];
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
](
int
bias
)
{
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
]
(
int
bias
)
{
int64_t
i
=
static_cast
<
int64_t
>
(
start
+
bias
);
if
(
i
>=
end
)
{
return
std
::
make_pair
(
-
1
,
0.0
);
...
...
@@ -1097,7 +1344,7 @@ IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* ind
const
int64_t
*
ptr_col_ptr
=
reinterpret_cast
<
const
int64_t
*>
(
col_ptr
);
int64_t
start
=
ptr_col_ptr
[
col_idx
];
int64_t
end
=
ptr_col_ptr
[
col_idx
+
1
];
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
](
int
bias
)
{
return
[
ptr_col_ptr
,
indices
,
data_ptr
,
ncol_ptr
,
nelem
,
start
,
end
]
(
int
bias
)
{
int64_t
i
=
static_cast
<
int64_t
>
(
start
+
bias
);
if
(
i
>=
end
)
{
return
std
::
make_pair
(
-
1
,
0.0
);
...
...
src/io/bin.cpp
View file @
eade219e
...
...
@@ -2,6 +2,7 @@
#include <LightGBM/bin.h>
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
#include "sparse_bin.hpp"
#include "ordered_sparse_bin.hpp"
...
...
@@ -30,7 +31,9 @@ BinMapper::BinMapper(const BinMapper& other) {
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
}
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
...
...
@@ -41,37 +44,60 @@ BinMapper::~BinMapper() {
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
,
BinType
bin_type
)
{
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
int
sum_left
=
0
;
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
sum_left
+=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
else
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
int
sum_left
=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
return
true
;
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
)
{
bin_type_
=
bin_type
;
std
::
vector
<
double
>&
ref_values
=
(
*
values
)
;
s
ize_t
sample_size
=
total_sample_cnt
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
r
ef
_values
.
size
());
default_bin_
=
0
;
s
td
::
vector
<
double
>&
raw_values
=
values
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
r
aw
_values
.
size
());
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
r
ef
_values
.
begin
(),
r
ef
_values
.
end
());
std
::
sort
(
r
aw
_values
.
begin
(),
r
aw
_values
.
end
());
// push zero in the front
if
(
r
ef
_values
.
empty
()
||
(
r
ef
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0
);
if
(
r
aw
_values
.
empty
()
||
(
r
aw
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
}
if
(
!
r
ef
_values
.
empty
())
{
distinct_values
.
push_back
(
r
ef
_values
[
0
]);
if
(
!
r
aw
_values
.
empty
())
{
distinct_values
.
push_back
(
r
aw
_values
[
0
]);
counts
.
push_back
(
1
);
}
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
ref_values
[
i
-
1
]
==
0.0
f
)
{
counts
.
back
()
+=
zero_cnt
;
}
else
if
(
ref_values
[
i
-
1
]
<
0.0
f
&&
ref_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0
);
for
(
size_t
i
=
1
;
i
<
raw_values
.
size
();
++
i
)
{
if
(
raw_values
[
i
]
!=
raw_values
[
i
-
1
])
{
if
(
raw_values
[
i
-
1
]
<
0.0
f
&&
raw_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
zero_cnt
);
}
distinct_values
.
push_back
(
r
ef
_values
[
i
]);
distinct_values
.
push_back
(
r
aw
_values
[
i
]);
counts
.
push_back
(
1
);
}
else
{
++
counts
.
back
();
...
...
@@ -79,29 +105,44 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
// push zero in the back
if
(
!
r
ef
_values
.
empty
()
&&
r
ef
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
);
if
(
!
r
aw
_values
.
empty
()
&&
r
aw
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
}
min_val_
=
distinct_values
.
front
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
cnt_in_bin0
=
0
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
num_values
<=
max_bin
)
{
std
::
sort
(
distinct_values
.
begin
(),
distinct_values
.
end
());
// use distinct value is enough
num_bin_
=
num_values
;
b
in
_upper_bound_
=
std
::
vector
<
double
>
(
num_values
)
;
bin_upper_bound_
.
clear
()
;
in
t
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
;
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cur_cnt_inbin
=
0
;
}
}
cnt_in_bin0
=
counts
[
0
];
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
if
(
zero_cnt
>
mean_bin_size
)
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
}
// mean size for one bin
double
mean_bin_size
=
sample_size
/
static_cast
<
double
>
(
max_bin
);
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
sample_
size
);
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_
sample_
cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
...
...
@@ -110,8 +151,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
...
...
@@ -127,9 +167,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
if
(
bin_cnt
==
0
)
{
cnt_in_bin0
=
cur_cnt_inbin
;
}
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
...
...
@@ -140,7 +178,8 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
}
}
//
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
...
...
@@ -151,7 +190,7 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
CHECK
(
num_bin_
<=
max_bin
);
}
else
{
// convert to int type first
std
::
vector
<
int
>
distinct_values_int
;
...
...
@@ -169,20 +208,21 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
// sort by counts
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
// will ingore the categorical of small counts
num_bin_
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
counts_int
.
size
())
);
const
int
cut_cnt
=
static_cast
<
int
>
(
total_sample_cnt
*
0.98
f
);
categorical_2_bin_
.
clear
();
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
bin_2_categorical_
.
clear
();
num_bin_
=
0
;
int
used_cnt
=
0
;
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
bin_2_categorical_
[
i
]
=
distinct_values_int
[
i
];
categorical_2_bin_
[
distinct_values_int
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
used_cnt
+=
counts_int
[
i
];
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
}
if
(
used_cnt
/
static_cast
<
double
>
(
sample_size
)
<
0.95
f
)
{
Log
::
Warning
(
"Too many categoricals are ignored, \
please use bigger max_bin or partition this column "
);
}
cnt_in_bin0
=
static_cast
<
int
>
(
sample_size
)
-
used_cnt
+
counts_int
[
0
];
cnt_in_bin
=
counts_int
;
counts_int
.
resize
(
num_bin_
);
counts_int
.
back
()
+=
static_cast
<
int
>
(
total_sample_cnt
-
used_cnt
);
}
// check trival(num_bin_ == 1) feature
...
...
@@ -191,8 +231,16 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
else
{
is_trival_
=
false
;
}
// check useless bin
if
(
!
is_trival_
&&
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
,
bin_type_
))
{
is_trival_
=
true
;
}
if
(
!
is_trival_
)
{
default_bin_
=
ValueToBin
(
0
);
}
// calculate sparse rate
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
0
)
/
static_cast
<
double
>
(
sample_
size
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
default_bin_
]
)
/
static_cast
<
double
>
(
total_
sample_
cnt
);
}
...
...
@@ -202,7 +250,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinType
);
size
+=
2
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
return
size
;
}
...
...
@@ -215,6 +265,12 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
...
...
@@ -231,6 +287,12 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
...
...
@@ -249,6 +311,9 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
...
...
@@ -258,7 +323,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
bin_type_
);
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
)
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
...
...
@@ -271,73 +336,47 @@ template class DenseBin<uint8_t>;
template
class
DenseBin
<
uint16_t
>;
template
class
DenseBin
<
uint32_t
>;
template
class
DenseCategoricalBin
<
uint8_t
>;
template
class
DenseCategoricalBin
<
uint16_t
>;
template
class
DenseCategoricalBin
<
uint32_t
>;
template
class
SparseBin
<
uint8_t
>;
template
class
SparseBin
<
uint16_t
>;
template
class
SparseBin
<
uint32_t
>;
template
class
SparseCategoricalBin
<
uint8_t
>;
template
class
SparseCategoricalBin
<
uint16_t
>;
template
class
SparseCategoricalBin
<
uint32_t
>;
template
class
OrderedSparseBin
<
uint8_t
>;
template
class
OrderedSparseBin
<
uint16_t
>;
template
class
OrderedSparseBin
<
uint32_t
>;
double
BinMapper
::
kSparseThreshold
=
0.8
f
;
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
bool
*
is_sparse
,
int
default_bin
,
BinType
bin_type
)
{
bool
is_enable_sparse
,
bool
*
is_sparse
)
{
// sparse threshold
const
double
kSparseThreshold
=
0.8
f
;
if
(
sparse_rate
>=
kSparseThreshold
&&
is_enable_sparse
)
{
if
(
sparse_rate
>=
BinMapper
::
kSparseThreshold
&&
is_enable_sparse
)
{
*
is_sparse
=
true
;
return
CreateSparseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateSparseBin
(
num_data
,
num_bin
);
}
else
{
*
is_sparse
=
false
;
return
CreateDenseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateDenseBin
(
num_data
,
num_bin
);
}
}
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
256
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
DenseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
DenseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
else
{
if
(
num_bin
<=
256
)
{
return
new
DenseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
num_bin
<=
16
)
{
return
new
Dense4bitsBin
(
num_data
);
}
else
if
(
num_bin
<=
256
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
Dense
Categorical
Bin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
DenseBin
<
uint16_t
>
(
num_data
);
}
else
{
return
new
DenseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
return
new
DenseBin
<
uint32_t
>
(
num_data
);
}
}
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
num_bin
<=
256
)
{
return
new
SparseBin
<
uint8_t
>
(
num_data
,
default_bin
);
return
new
SparseBin
<
uint8_t
>
(
num_data
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
SparseBin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
SparseBin
<
uint16_t
>
(
num_data
);
}
else
{
return
new
SparseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
else
{
if
(
num_bin
<=
256
)
{
return
new
SparseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
SparseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
SparseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
return
new
SparseBin
<
uint32_t
>
(
num_data
);
}
}
...
...
src/io/config.cpp
View file @
eade219e
...
...
@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed.
if
(
GetInt
(
params
,
"seed"
,
&
seed
))
{
Random
rand
(
seed
);
int
int_max
=
std
::
numeric_limits
<
in
t
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
int
int_max
=
std
::
numeric_limits
<
shor
t
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
}
GetTaskType
(
params
);
GetBoostingType
(
params
);
...
...
@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type
=
"gbdt"
;
}
else
if
(
value
==
std
::
string
(
"dart"
))
{
boosting_type
=
"dart"
;
}
else
if
(
value
==
std
::
string
(
"goss"
))
{
boosting_type
=
"goss"
;
}
else
{
Log
::
Fatal
(
"Unknown boosting type %s"
,
value
.
c_str
());
}
...
...
@@ -102,7 +104,7 @@ void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::str
std
::
transform
(
value
.
begin
(),
value
.
end
(),
value
.
begin
(),
Common
::
tolower
);
// split
std
::
vector
<
std
::
string
>
metrics
=
Common
::
Split
(
value
.
c_str
(),
','
);
// remove du
m
plicate
// remove duplicate
std
::
unordered_set
<
std
::
string
>
metric_sets
;
for
(
auto
&
metric
:
metrics
)
{
std
::
transform
(
metric
.
begin
(),
metric
.
end
(),
metric
.
begin
(),
Common
::
tolower
);
...
...
@@ -147,6 +149,7 @@ void OverallConfig::CheckParamConflict() {
Log
::
Fatal
(
"Number of classes must be 1 for non-multiclass training"
);
}
}
if
(
boosting_config
.
is_provide_training_metric
||
!
io_config
.
valid_data_filenames
.
empty
())
{
for
(
std
::
string
metric_type
:
metric_types
)
{
bool
metric_type_multiclass
=
(
metric_type
==
std
::
string
(
"multi_logloss"
)
||
metric_type
==
std
::
string
(
"multi_error"
));
if
((
objective_type_multiclass
&&
!
metric_type_multiclass
)
...
...
@@ -154,6 +157,7 @@ void OverallConfig::CheckParamConflict() {
Log
::
Fatal
(
"Objective and metrics don't match"
);
}
}
}
if
(
network_config
.
num_machines
>
1
)
{
is_parallel
=
true
;
...
...
@@ -177,7 +181,7 @@ void OverallConfig::CheckParamConflict() {
&&
boosting_config
.
tree_learner_type
==
std
::
string
(
"data"
))
{
Log
::
Warning
(
"Histogram LRU queue was enabled (histogram_pool_size=%f). Will disable this to reduce communication costs"
,
boosting_config
.
tree_config
.
histogram_pool_size
);
// Change pool size to -1 (no
t
limit) when using data parallel to reduce communication costs
// Change pool size to -1 (no limit) when using data parallel to reduce communication costs
boosting_config
.
tree_config
.
histogram_pool_size
=
-
1
;
}
...
...
@@ -213,6 +217,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"categorical_column"
,
&
categorical_column
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
GetBool
(
params
,
"enable_bundle"
,
&
enable_bundle
);
GetBool
(
params
,
"adjacent_bundle"
,
&
adjacent_bundle
);
}
...
...
@@ -222,6 +231,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
GetDouble
(
params
,
"huber_delta"
,
&
huber_delta
);
GetDouble
(
params
,
"fair_c"
,
&
fair_c
);
GetDouble
(
params
,
"gaussian_eta"
,
&
gaussian_eta
);
GetDouble
(
params
,
"poisson_max_delta_step"
,
&
poisson_max_delta_step
);
GetInt
(
params
,
"max_position"
,
&
max_position
);
CHECK
(
max_position
>
0
);
GetInt
(
params
,
"num_class"
,
&
num_class
);
...
...
@@ -293,7 +303,6 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetDouble
(
params
,
"histogram_pool_size"
,
&
histogram_pool_size
);
GetInt
(
params
,
"max_depth"
,
&
max_depth
);
GetInt
(
params
,
"top_k"
,
&
top_k
);
CHECK
(
max_depth
>
1
||
max_depth
<
0
);
}
...
...
@@ -320,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt
(
params
,
"max_drop"
,
&
max_drop
);
GetBool
(
params
,
"xgboost_dart_mode"
,
&
xgboost_dart_mode
);
GetBool
(
params
,
"uniform_drop"
,
&
uniform_drop
);
GetDouble
(
params
,
"top_rate"
,
&
top_rate
);
GetDouble
(
params
,
"other_rate"
,
&
other_rate
);
CHECK
(
drop_rate
<=
1.0
&&
drop_rate
>=
0.0
);
CHECK
(
skip_drop
<=
1.0
&&
skip_drop
>=
0.0
);
GetTreeLearnerType
(
params
);
...
...
src/io/dataset.cpp
View file @
eade219e
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/feature.h>
#include <omp.h>
#include <chrono>
#include <cstdio>
#include <unordered_map>
#include <limits>
...
...
@@ -19,55 +20,212 @@ const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______
Dataset
::
Dataset
()
{
data_filename_
=
"noname"
;
num_data_
=
0
;
is_finish_load_
=
false
;
}
Dataset
::
Dataset
(
data_size_t
num_data
)
{
data_filename_
=
"noname"
;
num_data_
=
num_data
;
metadata_
.
Init
(
num_data_
,
-
1
,
-
1
);
metadata_
.
Init
(
num_data_
,
NO_SPECIFIC
,
NO_SPECIFIC
);
is_finish_load_
=
false
;
}
Dataset
::~
Dataset
()
{
}
std
::
vector
<
std
::
vector
<
int
>>
NoGroup
(
const
std
::
vector
<
int
>&
used_features
)
{
std
::
vector
<
std
::
vector
<
int
>>
features_in_group
;
features_in_group
.
resize
(
used_features
.
size
());
for
(
size_t
i
=
0
;
i
<
used_features
.
size
();
++
i
)
{
features_in_group
[
i
].
emplace_back
(
used_features
[
i
]);
}
return
features_in_group
;
}
void
Dataset
::
Construct
(
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
const
std
::
vector
<
std
::
vector
<
int
>>&
,
size_t
,
const
IOConfig
&
io_config
)
{
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
// get num_features
std
::
vector
<
int
>
used_features
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bin_mappers
.
size
());
++
i
)
{
if
(
bin_mappers
[
i
]
!=
nullptr
&&
!
bin_mappers
[
i
]
->
is_trival
())
{
used_features
.
emplace_back
(
i
);
}
}
auto
features_in_group
=
NoGroup
(
used_features
);
num_features_
=
0
;
for
(
const
auto
&
fs
:
features_in_group
)
{
num_features_
+=
static_cast
<
int
>
(
fs
.
size
());
}
int
cur_fidx
=
0
;
used_feature_map_
=
std
::
vector
<
int
>
(
num_total_features_
,
-
1
);
num_groups_
=
static_cast
<
int
>
(
features_in_group
.
size
());
real_feature_idx_
.
resize
(
num_features_
);
feature2group_
.
resize
(
num_features_
);
feature2subfeature_
.
resize
(
num_features_
);
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
auto
cur_features
=
features_in_group
[
i
];
int
cur_cnt_features
=
static_cast
<
int
>
(
cur_features
.
size
());
// get bin_mappers
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
cur_bin_mappers
;
for
(
int
j
=
0
;
j
<
cur_cnt_features
;
++
j
)
{
int
real_fidx
=
cur_features
[
j
];
used_feature_map_
[
real_fidx
]
=
cur_fidx
;
real_feature_idx_
[
cur_fidx
]
=
real_fidx
;
feature2group_
[
cur_fidx
]
=
i
;
feature2subfeature_
[
cur_fidx
]
=
j
;
cur_bin_mappers
.
emplace_back
(
bin_mappers
[
real_fidx
].
release
());
++
cur_fidx
;
}
feature_groups_
.
emplace_back
(
std
::
unique_ptr
<
FeatureGroup
>
(
new
FeatureGroup
(
cur_cnt_features
,
cur_bin_mappers
,
num_data_
,
io_config
.
is_enable_sparse
)));
}
feature_groups_
.
shrink_to_fit
();
group_bin_boundaries_
.
clear
();
uint64_t
num_total_bin
=
0
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
num_total_bin
+=
feature_groups_
[
i
]
->
num_total_bin_
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
}
int
last_group
=
0
;
group_feature_start_
.
reserve
(
num_groups_
);
group_feature_cnt_
.
reserve
(
num_groups_
);
group_feature_start_
.
push_back
(
0
);
group_feature_cnt_
.
push_back
(
1
);
for
(
int
i
=
1
;
i
<
num_features_
;
++
i
)
{
const
int
group
=
feature2group_
[
i
];
if
(
group
==
last_group
)
{
group_feature_cnt_
.
back
()
=
group_feature_cnt_
.
back
()
+
1
;
}
else
{
group_feature_start_
.
push_back
(
i
);
group_feature_cnt_
.
push_back
(
1
);
last_group
=
group
;
}
}
}
void
Dataset
::
FinishLoad
()
{
if
(
is_finish_load_
)
{
return
;
}
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_
feature
s_
;
++
i
)
{
feature
s_
[
i
]
->
FinishLoad
();
for
(
int
i
=
0
;
i
<
num_
group
s_
;
++
i
)
{
feature
_groups_
[
i
]
->
bin_data_
->
FinishLoad
();
}
is_finish_load_
=
true
;
}
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
,
bool
is_enable_sparse
)
{
features_
.
clear
();
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
)
{
feature_groups_
.
clear
();
num_features_
=
dataset
->
num_features_
;
num_groups_
=
dataset
->
num_groups_
;
bool
is_enable_sparse
=
false
;
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
if
(
dataset
->
feature_groups_
[
i
]
->
is_sparse_
)
{
is_enable_sparse
=
true
;
break
;
}
}
// copy feature bin mapper data
for
(
const
auto
&
feature
:
dataset
->
features_
)
{
features_
.
emplace_back
(
std
::
unique_ptr
<
Feature
>
(
new
Feature
(
feature
->
feature_index
(),
new
BinMapper
(
*
feature
->
bin_mapper
()),
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
;
for
(
int
j
=
0
;
j
<
dataset
->
feature_groups_
[
i
]
->
num_feature_
;
++
j
)
{
bin_mappers
.
emplace_back
(
new
BinMapper
(
*
(
dataset
->
feature_groups_
[
i
]
->
bin_mappers_
[
j
])));
}
feature_groups_
.
emplace_back
(
new
FeatureGroup
(
dataset
->
feature_groups_
[
i
]
->
num_feature_
,
bin_mappers
,
num_data_
,
is_enable_sparse
)
));
is_enable_sparse
));
}
features_
.
shrink_to_fit
();
feature
_group
s_
.
shrink_to_fit
();
used_feature_map_
=
dataset
->
used_feature_map_
;
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
num_total_features_
=
dataset
->
num_total_features_
;
feature_names_
=
dataset
->
feature_names_
;
label_idx_
=
dataset
->
label_idx_
;
real_feature_idx_
=
dataset
->
real_feature_idx_
;
feature2group_
=
dataset
->
feature2group_
;
feature2subfeature_
=
dataset
->
feature2subfeature_
;
group_bin_boundaries_
=
dataset
->
group_bin_boundaries_
;
group_feature_start_
=
dataset
->
group_feature_start_
;
group_feature_cnt_
=
dataset
->
group_feature_cnt_
;
}
Dataset
*
Dataset
::
Subset
(
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
is_enable_sparse
)
const
{
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_used_indices
));
ret
->
CopyFeatureMapperFrom
(
this
,
is_enable_sparse
);
#pragma omp parallel for schedule(guided)
for
(
int
fidx
=
0
;
fidx
<
num_features_
;
++
fidx
)
{
auto
iterator
=
features_
[
fidx
]
->
bin_data
()
->
GetIterator
(
0
);
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
ret
->
features_
[
fidx
]
->
PushBin
(
0
,
i
,
iterator
->
Get
(
used_indices
[
i
]));
void
Dataset
::
CreateValid
(
const
Dataset
*
dataset
)
{
feature_groups_
.
clear
();
num_features_
=
dataset
->
num_features_
;
num_groups_
=
num_features_
;
bool
is_enable_sparse
=
true
;
feature2group_
.
clear
();
feature2subfeature_
.
clear
();
// copy feature bin mapper data
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
;
bin_mappers
.
emplace_back
(
new
BinMapper
(
*
(
dataset
->
FeatureBinMapper
(
i
))));
feature_groups_
.
emplace_back
(
new
FeatureGroup
(
1
,
bin_mappers
,
num_data_
,
is_enable_sparse
));
feature2group_
.
push_back
(
i
);
feature2subfeature_
.
push_back
(
0
);
}
feature_groups_
.
shrink_to_fit
();
used_feature_map_
=
dataset
->
used_feature_map_
;
num_total_features_
=
dataset
->
num_total_features_
;
feature_names_
=
dataset
->
feature_names_
;
label_idx_
=
dataset
->
label_idx_
;
real_feature_idx_
=
dataset
->
real_feature_idx_
;
group_bin_boundaries_
.
clear
();
uint64_t
num_total_bin
=
0
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
num_total_bin
+=
feature_groups_
[
i
]
->
num_total_bin_
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
}
int
last_group
=
0
;
group_feature_start_
.
reserve
(
num_groups_
);
group_feature_cnt_
.
reserve
(
num_groups_
);
group_feature_start_
.
push_back
(
0
);
group_feature_cnt_
.
push_back
(
1
);
for
(
int
i
=
1
;
i
<
num_features_
;
++
i
)
{
const
int
group
=
feature2group_
[
i
];
if
(
group
==
last_group
)
{
group_feature_cnt_
.
back
()
=
group_feature_cnt_
.
back
()
+
1
;
}
else
{
group_feature_start_
.
push_back
(
i
);
group_feature_cnt_
.
push_back
(
1
);
last_group
=
group
;
}
}
}
void
Dataset
::
ReSize
(
data_size_t
num_data
)
{
if
(
num_data_
!=
num_data
)
{
num_data_
=
num_data
;
#pragma omp parallel for schedule(static)
for
(
int
group
=
0
;
group
<
num_groups_
;
++
group
)
{
feature_groups_
[
group
]
->
bin_data_
->
ReSize
(
num_data_
);
}
}
ret
->
metadata_
.
Init
(
metadata_
,
used_indices
,
num_used_indices
);
return
ret
.
release
();
}
void
Dataset
::
CopySubset
(
const
Dataset
*
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
need_meta_data
)
{
CHECK
(
num_used_indices
==
num_data_
);
#pragma omp parallel for schedule(static)
for
(
int
group
=
0
;
group
<
num_groups_
;
++
group
)
{
feature_groups_
[
group
]
->
CopySubset
(
fullset
->
feature_groups_
[
group
].
get
(),
used_indices
,
num_used_indices
);
}
if
(
need_meta_data
)
{
metadata_
.
Init
(
fullset
->
metadata_
,
used_indices
,
num_used_indices
);
}
is_finish_load_
=
true
;
}
bool
Dataset
::
SetFloatField
(
const
char
*
field_name
,
const
float
*
field_data
,
data_size_t
num_element
)
{
...
...
@@ -99,8 +257,6 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, data_si
name
=
Common
::
Trim
(
name
);
if
(
name
==
std
::
string
(
"query"
)
||
name
==
std
::
string
(
"group"
))
{
metadata_
.
SetQuery
(
field_data
,
num_element
);
}
else
if
(
name
==
std
::
string
(
"query_id"
)
||
name
==
std
::
string
(
"group_id"
))
{
metadata_
.
SetQueryId
(
field_data
,
num_element
);
}
else
{
return
false
;
}
...
...
@@ -186,7 +342,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
+
sizeof
(
int
)
*
num_total_features_
+
sizeof
(
num_groups_
)
+
3
*
sizeof
(
int
)
*
num_features_
+
sizeof
(
uint64_t
)
*
(
num_groups_
+
1
)
+
2
*
sizeof
(
int
)
*
num_groups_
;
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
size_of_header
+=
feature_names_
[
i
].
size
()
+
sizeof
(
int
);
...
...
@@ -195,10 +352,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// write header
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
size_t
num_used_feature_map
=
used_feature_map_
.
size
();
fwrite
(
&
num_used_feature_map
,
sizeof
(
num_used_feature_map
),
1
,
file
);
fwrite
(
used_feature_map_
.
data
(),
sizeof
(
int
),
num_used_feature_map
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_total_features_
),
1
,
file
);
fwrite
(
used_feature_map_
.
data
(),
sizeof
(
int
),
num_total_features_
,
file
);
fwrite
(
&
num_groups_
,
sizeof
(
num_groups_
),
1
,
file
);
fwrite
(
real_feature_idx_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
feature2group_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
feature2subfeature_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
group_bin_boundaries_
.
data
(),
sizeof
(
uint64_t
),
num_groups_
+
1
,
file
);
fwrite
(
group_feature_start_
.
data
(),
sizeof
(
int
),
num_groups_
,
file
);
fwrite
(
group_feature_cnt_
.
data
(),
sizeof
(
int
),
num_groups_
,
file
);
// write feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
...
...
@@ -215,15 +377,95 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
metadata_
.
SaveBinaryToFile
(
file
);
// write feature data
for
(
int
i
=
0
;
i
<
num_
feature
s_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
group
s_
;
++
i
)
{
// get size of feature
size_t
size_of_feature
=
features_
[
i
]
->
SizesInByte
();
size_t
size_of_feature
=
feature
_group
s_
[
i
]
->
SizesInByte
();
fwrite
(
&
size_of_feature
,
sizeof
(
size_of_feature
),
1
,
file
);
// write feature
features_
[
i
]
->
SaveBinaryToFile
(
file
);
feature
_group
s_
[
i
]
->
SaveBinaryToFile
(
file
);
}
fclose
(
file
);
}
}
void
Dataset
::
ConstructHistograms
(
const
std
::
vector
<
int8_t
>&
is_feature_used
,
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
int
leaf_idx
,
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>&
ordered_bins
,
const
score_t
*
gradients
,
const
score_t
*
hessians
,
score_t
*
ordered_gradients
,
score_t
*
ordered_hessians
,
HistogramBinEntry
*
hist_data
)
const
{
if
(
leaf_idx
<
0
||
num_data
<=
0
||
hist_data
==
nullptr
)
{
return
;
}
auto
ptr_ordered_grad
=
gradients
;
auto
ptr_ordered_hess
=
hessians
;
if
(
data_indices
!=
nullptr
&&
num_data
<
num_data_
)
{
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
ordered_gradients
[
i
]
=
gradients
[
data_indices
[
i
]];
ordered_hessians
[
i
]
=
hessians
[
data_indices
[
i
]];
}
ptr_ordered_grad
=
ordered_gradients
;
ptr_ordered_hess
=
ordered_hessians
;
}
#pragma omp parallel for schedule(static)
for
(
int
group
=
0
;
group
<
num_groups_
;
++
group
)
{
bool
is_groud_used
=
false
;
const
int
f_cnt
=
group_feature_cnt_
[
group
];
for
(
int
j
=
0
;
j
<
f_cnt
;
++
j
)
{
const
int
fidx
=
group_feature_start_
[
group
]
+
j
;
if
(
is_feature_used
[
fidx
])
{
is_groud_used
=
true
;
break
;
}
}
if
(
!
is_groud_used
)
{
continue
;
}
// feature is not used
auto
data_ptr
=
hist_data
+
group_bin_boundaries_
[
group
];
const
int
num_bin
=
feature_groups_
[
group
]
->
num_total_bin_
;
std
::
memset
(
data_ptr
+
1
,
0
,
(
num_bin
-
1
)
*
sizeof
(
HistogramBinEntry
));
// construct histograms for smaller leaf
if
(
ordered_bins
[
group
]
==
nullptr
)
{
// if not use ordered bin
feature_groups_
[
group
]
->
bin_data_
->
ConstructHistogram
(
data_indices
,
num_data
,
ptr_ordered_grad
,
ptr_ordered_hess
,
data_ptr
);
}
else
{
// used ordered bin
ordered_bins
[
group
]
->
ConstructHistogram
(
leaf_idx
,
gradients
,
hessians
,
data_ptr
);
}
}
}
void
Dataset
::
FixHistogram
(
int
feature_idx
,
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
HistogramBinEntry
*
data
)
const
{
const
int
group
=
feature2group_
[
feature_idx
];
const
int
sub_feature
=
feature2subfeature_
[
feature_idx
];
const
BinMapper
*
bin_mapper
=
feature_groups_
[
group
]
->
bin_mappers_
[
sub_feature
].
get
();
const
int
default_bin
=
bin_mapper
->
GetDefaultBin
();
if
(
default_bin
>
0
)
{
const
int
num_bin
=
bin_mapper
->
num_bin
();
data
[
default_bin
].
sum_gradients
=
sum_gradient
;
data
[
default_bin
].
sum_hessians
=
sum_hessian
;
data
[
default_bin
].
cnt
=
num_data
;
for
(
int
i
=
0
;
i
<
num_bin
;
++
i
)
{
if
(
i
!=
default_bin
)
{
data
[
default_bin
].
sum_gradients
-=
data
[
i
].
sum_gradients
;
data
[
default_bin
].
sum_hessians
-=
data
[
i
].
sum_hessians
;
data
[
default_bin
].
cnt
-=
data
[
i
].
cnt
;
}
}
}
}
}
// namespace LightGBM
src/io/dataset_loader.cpp
View file @
eade219e
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
...
...
@@ -132,8 +131,6 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_
.
emplace
(
group_idx_
);
}
}
// load categorical features
if
(
io_config_
.
categorical_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config_
.
categorical_column
,
name_prefix
))
{
std
::
string
names
=
io_config_
.
categorical_column
.
substr
(
name_prefix
.
size
());
...
...
@@ -209,7 +206,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
}
else
{
// load data from binary file
dataset
.
reset
(
LoadFromBinFile
(
filename
,
bin_filename
.
c_str
(),
rank
,
num_machines
));
dataset
.
reset
(
LoadFromBinFile
(
filename
,
bin_filename
.
c_str
(),
rank
,
num_machines
,
&
num_global_data
,
&
used_data_indices
));
}
// check meta data
dataset
->
metadata_
.
CheckOrPartition
(
num_global_data
,
used_data_indices
);
...
...
@@ -238,7 +235,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_data
.
size
());
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
C
opyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
dataset
->
C
reateValid
(
train_data
);
// extract features
ExtractFeaturesFromMemory
(
text_data
,
parser
.
get
(),
dataset
.
get
());
text_data
.
clear
();
...
...
@@ -249,13 +246,13 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
num_global_data
=
dataset
->
num_data_
;
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
C
opyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
dataset
->
C
reateValid
(
train_data
);
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
}
}
else
{
// load data from binary file
dataset
.
reset
(
LoadFromBinFile
(
filename
,
bin_filename
.
c_str
(),
0
,
1
));
dataset
.
reset
(
LoadFromBinFile
(
filename
,
bin_filename
.
c_str
(),
0
,
1
,
&
num_global_data
,
&
used_data_indices
));
}
// not need to check validation data
// check meta data
...
...
@@ -263,7 +260,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
return
dataset
.
release
();
}
Dataset
*
DatasetLoader
::
LoadFromBinFile
(
const
char
*
data_filename
,
const
char
*
bin_filename
,
int
rank
,
int
num_machines
)
{
Dataset
*
DatasetLoader
::
LoadFromBinFile
(
const
char
*
data_filename
,
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
int
*
num_global_data
,
std
::
vector
<
data_size_t
>*
used_data_indices
)
{
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
FILE
*
file
;
#ifdef _MSC_VER
...
...
@@ -318,14 +315,60 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr
+=
sizeof
(
dataset
->
num_features_
);
dataset
->
num_total_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_total_features_
);
size_t
num_used_feature_map
=
*
(
reinterpret_cast
<
const
size_t
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
num_used_feature_map
);
const
int
*
tmp_feature_map
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
used_feature_map_
.
clear
();
for
(
size_
t
i
=
0
;
i
<
num_used
_feature_
map
;
++
i
)
{
for
(
in
t
i
=
0
;
i
<
dataset
->
num_total
_feature
s
_
;
++
i
)
{
dataset
->
used_feature_map_
.
push_back
(
tmp_feature_map
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
num_used_feature_map
;
mem_ptr
+=
sizeof
(
int
)
*
dataset
->
num_total_features_
;
// num_groups
dataset
->
num_groups_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_groups_
);
// real_feature_idx_
const
int
*
tmp_ptr_real_feature_idx_
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
real_feature_idx_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_features_
;
++
i
)
{
dataset
->
real_feature_idx_
.
push_back
(
tmp_ptr_real_feature_idx_
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
dataset
->
num_features_
;
// feature2group
const
int
*
tmp_ptr_feature2group
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
feature2group_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_features_
;
++
i
)
{
dataset
->
feature2group_
.
push_back
(
tmp_ptr_feature2group
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
dataset
->
num_features_
;
// feature2subfeature
const
int
*
tmp_ptr_feature2subfeature
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
feature2subfeature_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_features_
;
++
i
)
{
dataset
->
feature2subfeature_
.
push_back
(
tmp_ptr_feature2subfeature
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
dataset
->
num_features_
;
// group_bin_boundaries
const
uint64_t
*
tmp_ptr_group_bin_boundaries
=
reinterpret_cast
<
const
uint64_t
*>
(
mem_ptr
);
dataset
->
group_bin_boundaries_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_groups_
+
1
;
++
i
)
{
dataset
->
group_bin_boundaries_
.
push_back
(
tmp_ptr_group_bin_boundaries
[
i
]);
}
mem_ptr
+=
sizeof
(
uint64_t
)
*
(
dataset
->
num_groups_
+
1
);
// group_feature_start_
const
int
*
tmp_ptr_group_feature_start
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
group_feature_start_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_groups_
;
++
i
)
{
dataset
->
group_feature_start_
.
push_back
(
tmp_ptr_group_feature_start
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
(
dataset
->
num_groups_
);
// group_feature_cnt_
const
int
*
tmp_ptr_group_feature_cnt
=
reinterpret_cast
<
const
int
*>
(
mem_ptr
);
dataset
->
group_feature_cnt_
.
clear
();
for
(
int
i
=
0
;
i
<
dataset
->
num_groups_
;
++
i
)
{
dataset
->
group_feature_cnt_
.
push_back
(
tmp_ptr_group_feature_cnt
[
i
]);
}
mem_ptr
+=
sizeof
(
int
)
*
(
dataset
->
num_groups_
);
// get feature names
dataset
->
feature_names_
.
clear
();
// write feature names
...
...
@@ -364,16 +407,16 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
// load meta data
dataset
->
metadata_
.
LoadFromMemory
(
buffer
.
data
());
std
::
vector
<
data_size_t
>
used_data_indices
;
data_size_t
num_global_data
=
dataset
->
num_data_
;
*
num_global_data
=
dataset
->
num_data_
;
used_data_indices
->
clear
()
;
// sample local used data if need to partition
if
(
num_machines
>
1
&&
!
io_config_
.
is_pre_partition
)
{
const
data_size_t
*
query_boundaries
=
dataset
->
metadata_
.
query_boundaries
();
if
(
query_boundaries
==
nullptr
)
{
// if not contain query file, minimal sample unit is one record
for
(
data_size_t
i
=
0
;
i
<
dataset
->
num_data_
;
++
i
)
{
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
used_data_indices
.
push_back
(
i
);
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
used_data_indices
->
push_back
(
i
);
}
}
}
else
{
...
...
@@ -388,21 +431,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if
(
i
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
}
if
(
is_query_used
)
{
used_data_indices
.
push_back
(
i
);
used_data_indices
->
push_back
(
i
);
}
}
}
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
used_data_indices
.
size
());
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
(
*
used_data_indices
)
.
size
());
}
dataset
->
metadata_
.
PartitionLabel
(
used_data_indices
);
dataset
->
metadata_
.
PartitionLabel
(
*
used_data_indices
);
// read feature data
for
(
int
i
=
0
;
i
<
dataset
->
num_
feature
s_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
dataset
->
num_
group
s_
;
++
i
)
{
// read feature size
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
size_t
),
1
,
file
);
if
(
read_cnt
!=
1
)
{
...
...
@@ -420,64 +463,49 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if
(
read_cnt
!=
size_of_feature
)
{
Log
::
Fatal
(
"Binary file error: feature %d is incorrect, read count: %d"
,
i
,
read_cnt
);
}
dataset
->
features_
.
emplace_back
(
std
::
unique_ptr
<
Feature
>
(
new
Feature
(
buffer
.
data
(),
num_global_data
,
used_data_indices
)
dataset
->
feature
_group
s_
.
emplace_back
(
std
::
unique_ptr
<
Feature
Group
>
(
new
Feature
Group
(
buffer
.
data
(),
*
num_global_data
,
*
used_data_indices
)
));
}
dataset
->
features_
.
shrink_to_fit
();
dataset
->
feature
_group
s_
.
shrink_to_fit
();
fclose
(
file
);
dataset
->
is_finish_load_
=
true
;
return
dataset
.
release
();
}
Dataset
*
DatasetLoader
::
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
)
{
Dataset
*
DatasetLoader
::
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
size_t
total_sample_size
,
data_size_t
num_data
)
{
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
(
sample_values
.
size
());
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
bin_mappers
[
i
].
reset
(
new
BinMapper
());
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
total_sample_size
,
io_config_
.
max_bin
,
bin_type
);
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
dataset
->
features_
.
clear
();
dataset
->
num_data_
=
num_data
;
// -1 means doesn't use this feature
dataset
->
used_feature_map_
=
std
::
vector
<
int
>
(
bin_mappers
.
size
(),
-
1
);
dataset
->
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
for
(
size_t
i
=
0
;
i
<
bin_mappers
.
size
();
++
i
)
{
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
// map real feature index to used feature index
dataset
->
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
// push new feature
dataset
->
features_
.
emplace_back
(
std
::
unique_ptr
<
Feature
>
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mappers
[
i
].
release
(),
dataset
->
num_data_
,
io_config_
.
is_enable_sparse
)
));
}
else
{
// if feature is trival(only 1 bin), free spaces
Log
::
Warning
(
"Ignoring Column_%d , only has one value"
,
i
);
}
}
dataset
->
features_
.
shrink_to_fit
();
// fill feature_names_ if not header
if
(
feature_names_
.
empty
())
{
for
(
int
i
=
0
;
i
<
dataset
->
num_total_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
())
;
++
i
)
{
std
::
stringstream
str_buf
;
str_buf
<<
"Column_"
<<
i
;
feature_names_
.
push_back
(
str_buf
.
str
());
}
}
const
data_size_t
filter_cnt
=
static_cast
<
data_size_t
>
(
static_cast
<
double
>
(
0.95
*
io_config_
.
min_data_in_leaf
)
/
num_data
*
sample_values
.
size
());
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
total_sample_size
,
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_data
));
dataset
->
feature_names_
=
feature_names_
;
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
NO_SPECIFIC
,
NO_SPECIFIC
);
dataset
->
Construct
(
bin_mappers
,
sample_indices
,
total_sample_size
,
io_config_
);
return
dataset
.
release
();
}
...
...
@@ -488,13 +516,34 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if
(
dataset
->
num_data_
<=
0
)
{
Log
::
Fatal
(
"Data file %s is empty"
,
dataset
->
data_filename_
);
}
if
(
dataset
->
features_
.
empty
())
{
if
(
dataset
->
feature
_group
s_
.
empty
())
{
Log
::
Fatal
(
"No usable features in data file %s"
,
dataset
->
data_filename_
);
}
if
(
dataset
->
feature_names_
.
size
()
!=
static_cast
<
size_t
>
(
dataset
->
num_total_features_
))
{
Log
::
Fatal
(
"Size of feature name error, should be %d, got %d"
,
dataset
->
num_total_features_
,
static_cast
<
int
>
(
dataset
->
feature_names_
.
size
()));
}
bool
is_feature_order_by_group
=
true
;
int
last_group
=
-
1
;
int
last_sub_feature
=
-
1
;
// if features are ordered, not need to use hist_buf
for
(
int
i
=
0
;
i
<
dataset
->
num_features_
;
++
i
)
{
int
group
=
dataset
->
feature2group_
[
i
];
int
sub_feature
=
dataset
->
feature2subfeature_
[
i
];
if
(
group
<
last_group
)
{
is_feature_order_by_group
=
false
;
}
else
if
(
group
==
last_group
)
{
if
(
sub_feature
<=
last_sub_feature
)
{
is_feature_order_by_group
=
false
;
break
;
}
}
last_group
=
group
;
last_sub_feature
=
sub_feature
;
}
if
(
!
is_feature_order_by_group
)
{
Log
::
Fatal
(
"feature in dataset should order by group"
);
}
}
std
::
vector
<
std
::
string
>
DatasetLoader
::
LoadTextDataToMemory
(
const
char
*
filename
,
const
Metadata
&
metadata
,
...
...
@@ -512,7 +561,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if
(
query_boundaries
==
nullptr
)
{
// if not contain query data, minimal sample unit is one record
*
num_global_data
=
text_reader
.
ReadAndFilterLines
([
this
,
rank
,
num_machines
](
data_size_t
)
{
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
...
...
@@ -532,7 +581,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
...
...
@@ -571,7 +620,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
// if not contain query file, minimal sample unit is one record
*
num_global_data
=
text_reader
.
SampleAndFilterFromFile
([
this
,
rank
,
num_machines
]
(
data_size_t
)
{
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
...
...
@@ -592,7 +641,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
Next
In
t
(
0
,
num_machines
)
==
rank
)
{
if
(
random_
.
Next
Shor
t
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
...
...
@@ -605,30 +654,28 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
}
void
DatasetLoader
::
ConstructBinMappersFromTextData
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
,
const
Parser
*
parser
,
Dataset
*
dataset
)
{
// sample_values[i][j], means the value of j-th sample on i-th feature
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
// temp buffer for one line features and label
std
::
vector
<
std
::
vector
<
int
>>
sample_indices
;
std
::
vector
<
std
::
pair
<
int
,
double
>>
oneline_features
;
double
label
;
for
(
size_
t
i
=
0
;
i
<
sample_data
.
size
();
++
i
)
{
for
(
in
t
i
=
0
;
i
<
static_cast
<
int
>
(
sample_data
.
size
()
)
;
++
i
)
{
oneline_features
.
clear
();
// parse features
parser
->
ParseOneLine
(
sample_data
[
i
].
c_str
(),
&
oneline_features
,
&
label
);
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
oneline_features
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
();
}
sample_values
.
resize
(
inner_data
.
first
+
1
);
sample_indices
.
resize
(
inner_data
.
first
+
1
);
}
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
sample_values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
if
(
std
::
fabs
(
inner_data
.
second
)
>
kEpsilon
)
{
sample_values
[
inner_data
.
first
].
emplace_back
(
inner_data
.
second
);
sample_indices
[
inner_data
.
first
].
emplace_back
(
i
);
}
}
}
dataset
->
features_
.
clear
();
dataset
->
feature
_group
s_
.
clear
();
if
(
feature_names_
.
empty
())
{
// -1 means doesn't use this feature
...
...
@@ -653,48 +700,32 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
}
dataset
->
feature_names_
=
feature_names_
;
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
(
sample_values
.
size
());
const
data_size_t
filter_cnt
=
static_cast
<
data_size_t
>
(
static_cast
<
double
>
(
0.95
*
io_config_
.
min_data_in_leaf
)
/
dataset
->
num_data_
*
sample_values
.
size
());
// start find bins
if
(
num_machines
==
1
)
{
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
(
sample_values
.
size
());
// if only one machine, find bin locally
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
bin_mappers
[
i
]
.
reset
(
nullptr
)
;
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
bin_type
);
}
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
if
(
bin_mappers
[
i
]
==
nullptr
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
());
}
else
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
// map real feature index to used feature index
dataset
->
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
// push new feature
dataset
->
features_
.
emplace_back
(
std
::
unique_ptr
<
Feature
>
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mappers
[
i
].
release
(),
dataset
->
num_data_
,
io_config_
.
is_enable_sparse
)
));
}
else
{
// if feature is trival(only 1 bin), free spaces
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
}
else
{
// if have multi-machines, need find bin distributed
// if have multi-machines, need
to
find bin distributed
// different machines will find bin for different features
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ st
r
at[i], start[i] + len[i] )
// machine i will find bins for features in [ sta
r
t[i], start[i] + len[i] )
std
::
vector
<
int
>
start
(
num_machines
);
std
::
vector
<
int
>
len
(
num_machines
);
int
total_num_feature
=
static_cast
<
int
>
(
sample_values
.
size
());
...
...
@@ -707,8 +738,50 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
}
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
// get size of bin mapper with max_bin_ size
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
io_config_
.
max_bin
);
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
// get max_bin
int
local_max_bin
=
0
;
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
local_max_bin
=
std
::
max
(
local_max_bin
,
bin_mappers
[
i
]
->
num_bin
());
}
int
max_bin
=
local_max_bin
;
// sync global max_bin
Network
::
Allreduce
(
reinterpret_cast
<
char
*>
(
&
local_max_bin
),
sizeof
(
local_max_bin
),
sizeof
(
local_max_bin
),
reinterpret_cast
<
char
*>
(
&
max_bin
),
[]
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
int
used_size
=
0
;
const
int
type_size
=
sizeof
(
int
);
const
int
*
p1
;
int
*
p2
;
while
(
used_size
<
len
)
{
p1
=
reinterpret_cast
<
const
int
*>
(
src
);
p2
=
reinterpret_cast
<
int
*>
(
dst
);
if
(
*
p1
>
*
p2
)
{
std
::
memcpy
(
dst
,
src
,
type_size
);
}
src
+=
type_size
;
dst
+=
type_size
;
used_size
+=
type_size
;
}
});
// get size of bin mapper with max_bin size
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
max_bin
);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int
buffer_size
=
type_size
*
total_num_feature
;
auto
input_buffer
=
std
::
vector
<
char
>
(
buffer_size
);
...
...
@@ -717,13 +790,12 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinMapper
bin_mapper
;
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
bin_mapper
.
FindBin
(
&
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
bin_type
);
bin_mapper
.
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
bin_mappers
[
i
]
->
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
// free
bin_mappers
[
i
].
reset
(
nullptr
);
}
// convert to binary size
for
(
int
i
=
0
;
i
<
num_machines
;
++
i
)
{
...
...
@@ -735,26 +807,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// restore features bins from buffer
for
(
int
i
=
0
;
i
<
total_num_feature
;
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
())
;
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
auto
bin_mapper
=
std
::
unique_ptr
<
BinMapper
>
(
new
BinMapper
());
bin_mapper
->
CopyFrom
(
output_buffer
.
data
()
+
i
*
type_size
);
if
(
!
bin_mapper
->
is_trival
())
{
dataset
->
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
features_
.
emplace_back
(
std
::
unique_ptr
<
Feature
>
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mapper
.
release
(),
dataset
->
num_data_
,
io_config_
.
is_enable_sparse
)
));
}
else
{
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
CopyFrom
(
output_buffer
.
data
()
+
i
*
type_size
);
}
}
dataset
->
features_
.
shrink_to_fit
();
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
()
);
sample_values
.
clear
();
dataset
->
Construct
(
bin_mappers
,
sample_indices
,
sample_data
.
size
(),
io_config_
);
}
/*! \brief Extract local features from memory */
...
...
@@ -763,7 +824,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
double
tmp_label
=
0.0
f
;
if
(
predict_fun_
==
nullptr
)
{
// if doesn't need to prediction with initial model
#pragma omp parallel for schedule(
guided
) private(oneline_features) firstprivate(tmp_label)
#pragma omp parallel for schedule(
static
) private(oneline_features) firstprivate(tmp_label)
for
(
data_size_t
i
=
0
;
i
<
dataset
->
num_data_
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
oneline_features
.
clear
();
...
...
@@ -781,7 +842,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int
feature_idx
=
dataset
->
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
dataset
->
features_
[
feature_idx
]
->
PushData
(
tid
,
i
,
inner_data
.
second
);
int
group
=
dataset
->
feature2group_
[
feature_idx
];
int
sub_feature
=
dataset
->
feature2subfeature_
[
feature_idx
];
dataset
->
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
dataset
->
metadata_
.
SetWeightAt
(
i
,
static_cast
<
float
>
(
inner_data
.
second
));
...
...
@@ -794,7 +857,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
else
{
// if need to prediction with initial model
std
::
vector
<
double
>
init_score
(
dataset
->
num_data_
*
num_class_
);
#pragma omp parallel for schedule(
guided
) private(oneline_features) firstprivate(tmp_label)
#pragma omp parallel for schedule(
static
) private(oneline_features) firstprivate(tmp_label)
for
(
data_size_t
i
=
0
;
i
<
dataset
->
num_data_
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
oneline_features
.
clear
();
...
...
@@ -817,7 +880,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int
feature_idx
=
dataset
->
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
dataset
->
features_
[
feature_idx
]
->
PushData
(
tid
,
i
,
inner_data
.
second
);
int
group
=
dataset
->
feature2group_
[
feature_idx
];
int
sub_feature
=
dataset
->
feature2subfeature_
[
feature_idx
];
dataset
->
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
dataset
->
metadata_
.
SetWeightAt
(
i
,
static_cast
<
float
>
(
inner_data
.
second
));
...
...
@@ -867,7 +932,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
int
feature_idx
=
dataset
->
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
dataset
->
features_
[
feature_idx
]
->
PushData
(
tid
,
start_idx
+
i
,
inner_data
.
second
);
int
group
=
dataset
->
feature2group_
[
feature_idx
];
int
sub_feature
=
dataset
->
feature2subfeature_
[
feature_idx
];
dataset
->
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
start_idx
+
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
dataset
->
metadata_
.
SetWeightAt
(
start_idx
+
i
,
static_cast
<
float
>
(
inner_data
.
second
));
...
...
src/io/dense_bin.hpp
View file @
eade219e
...
...
@@ -9,21 +9,41 @@
namespace
LightGBM
{
template
<
typename
VAL_T
>
class
DenseBin
;
template
<
typename
VAL_T
>
class
DenseBinIterator
:
public
BinIterator
{
public:
explicit
DenseBinIterator
(
const
DenseBin
<
VAL_T
>*
bin_data
,
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
:
bin_data_
(
bin_data
),
min_bin_
(
static_cast
<
VAL_T
>
(
min_bin
)),
max_bin_
(
static_cast
<
VAL_T
>
(
max_bin
)),
default_bin_
(
static_cast
<
uint8_t
>
(
default_bin
))
{
if
(
default_bin_
==
0
)
{
bias_
=
1
;
}
else
{
bias_
=
0
;
}
}
inline
uint32_t
Get
(
data_size_t
idx
)
override
;
inline
void
Reset
(
data_size_t
)
override
{
}
private:
const
DenseBin
<
VAL_T
>*
bin_data_
;
VAL_T
min_bin_
;
VAL_T
max_bin_
;
VAL_T
default_bin_
;
uint8_t
bias_
;
};
/*!
* \brief Used to store bins for dense feature
* Use template to reduce memory cost
*/
template
<
typename
VAL_T
>
class
DenseBin
:
public
Bin
{
class
DenseBin
:
public
Bin
{
public:
DenseBin
(
data_size_t
num_data
,
int
default_bin
)
:
num_data_
(
num_data
)
{
data_
.
resize
(
num_data_
);
VAL_T
default_bin_T
=
static_cast
<
VAL_T
>
(
default_bin
);
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
data_
[
i
]
=
default_bin_T
;
}
friend
DenseBinIterator
<
VAL_T
>
;
DenseBin
(
data_size_t
num_data
)
:
num_data_
(
num_data
),
data_
(
num_data_
,
static_cast
<
VAL_T
>
(
0
))
{
}
~
DenseBin
()
{
...
...
@@ -33,24 +53,27 @@ public:
data_
[
idx
]
=
static_cast
<
VAL_T
>
(
value
);
}
inline
uint32_t
Get
(
data_size_t
idx
)
const
{
return
static_cast
<
uint32_t
>
(
data_
[
idx
]);
void
ReSize
(
data_size_t
num_data
)
override
{
if
(
num_data_
!=
num_data
)
{
num_data_
=
num_data
;
data_
.
resize
(
num_data_
);
}
}
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
override
{
// use 4-way unrolling, will be faster
if
(
data_indices
!=
nullptr
)
{
// if use part of data
data_size_t
rest
=
num_data
%
4
;
const
data_size_t
rest
=
num_data
&
0x3
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
VAL_T
bin0
=
data_
[
data_indices
[
i
]];
VAL_T
bin1
=
data_
[
data_indices
[
i
+
1
]];
VAL_T
bin2
=
data_
[
data_indices
[
i
+
2
]];
VAL_T
bin3
=
data_
[
data_indices
[
i
+
3
]];
const
VAL_T
bin0
=
data_
[
data_indices
[
i
]];
const
VAL_T
bin1
=
data_
[
data_indices
[
i
+
1
]];
const
VAL_T
bin2
=
data_
[
data_indices
[
i
+
2
]];
const
VAL_T
bin3
=
data_
[
data_indices
[
i
+
3
]];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
...
...
@@ -68,19 +91,19 @@ public:
++
out
[
bin3
].
cnt
;
}
for
(;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
data_
[
data_indices
[
i
]];
const
VAL_T
bin
=
data_
[
data_indices
[
i
]];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
}
}
else
{
// use full data
data_size_t
rest
=
num_data
%
4
;
const
data_size_t
rest
=
num_data
&
0x3
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
VAL_T
bin0
=
data_
[
i
];
VAL_T
bin1
=
data_
[
i
+
1
];
VAL_T
bin2
=
data_
[
i
+
2
];
VAL_T
bin3
=
data_
[
i
+
3
];
const
VAL_T
bin0
=
data_
[
i
];
const
VAL_T
bin1
=
data_
[
i
+
1
];
const
VAL_T
bin2
=
data_
[
i
+
2
];
const
VAL_T
bin3
=
data_
[
i
+
3
];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
...
...
@@ -98,7 +121,7 @@ public:
++
out
[
bin3
].
cnt
;
}
for
(;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
data_
[
i
];
const
VAL_T
bin
=
data_
[
i
];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
...
...
@@ -106,18 +129,54 @@ public:
}
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
VAL_T
maxb
=
static_cast
<
VAL_T
>
(
max_bin
);
if
(
default_bin
==
0
)
{
th
-=
1
;
}
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
data_size_t
idx
=
data_indices
[
i
];
if
(
data_
[
idx
]
>
threshold
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
}
data_size_t
num_data
()
const
override
{
return
num_data_
;
}
...
...
@@ -140,6 +199,13 @@ public:
}
}
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
override
{
auto
other_bin
=
reinterpret_cast
<
const
DenseBin
<
VAL_T
>*>
(
full_bin
);
for
(
int
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
data_
[
i
]
=
other_bin
->
data_
[
used_indices
[
i
]];
}
}
void
SaveBinaryToFile
(
FILE
*
file
)
const
override
{
fwrite
(
data_
.
data
(),
sizeof
(
VAL_T
),
num_data_
,
file
);
}
...
...
@@ -154,45 +220,19 @@ protected:
};
template
<
typename
VAL_T
>
class
DenseBinIterator
:
public
BinIterator
{
public:
explicit
DenseBinIterator
(
const
DenseBin
<
VAL_T
>*
bin_data
)
:
bin_data_
(
bin_data
)
{
}
uint32_t
Get
(
data_size_t
idx
)
override
{
return
bin_data_
->
Get
(
idx
);
uint32_t
DenseBinIterator
<
VAL_T
>::
Get
(
data_size_t
idx
)
{
auto
ret
=
bin_data_
->
data_
[
idx
];
if
(
ret
>=
min_bin_
&&
ret
<=
max_bin_
)
{
return
ret
-
min_bin_
+
bias_
;
}
else
{
return
default_bin_
;
}
private:
const
DenseBin
<
VAL_T
>*
bin_data_
;
};
template
<
typename
VAL_T
>
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
data_size_t
)
const
{
return
new
DenseBinIterator
<
VAL_T
>
(
this
);
}
template
<
typename
VAL_T
>
class
DenseCategoricalBin
:
public
DenseBin
<
VAL_T
>
{
public:
DenseCategoricalBin
(
data_size_t
num_data
,
int
default_bin
)
:
DenseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
data_size_t
idx
=
data_indices
[
i
];
if
(
DenseBin
<
VAL_T
>::
data_
[
idx
]
!=
threshold
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
return
lte_count
;
}
};
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
{
return
new
DenseBinIterator
<
VAL_T
>
(
this
,
min_bin
,
max_bin
,
default_bin
);
}
}
// namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_
src/io/dense_nbits_bin.hpp
0 → 100644
View file @
eade219e
#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#include <LightGBM/bin.h>
#include <vector>
#include <cstring>
#include <cstdint>
namespace
LightGBM
{
class
Dense4bitsBin
;
/*!
* \brief Iterator over a Dense4bitsBin that remaps raw 4-bit values into the
*        [min_bin, max_bin] window of one feature group; values outside the
*        window yield default_bin_.
*/
class Dense4bitsBinIterator : public BinIterator {
public:
  explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
    : bin_data_(bin_data),
    min_bin_(static_cast<uint8_t>(min_bin)),
    max_bin_(static_cast<uint8_t>(max_bin)),
    default_bin_(static_cast<uint8_t>(default_bin)) {
    // bias of 1 when bin 0 is the implicit default (sparse-style encoding)
    bias_ = (default_bin_ == 0) ? 1 : 0;
  }
  inline uint32_t Get(data_size_t idx) override;
  // dense storage supports random access, so Reset is a no-op
  inline void Reset(data_size_t) override {}
private:
  const Dense4bitsBin* bin_data_;  // not owned
  uint8_t min_bin_;
  uint8_t max_bin_;
  uint8_t default_bin_;
  uint8_t bias_;
};
class
Dense4bitsBin
:
public
Bin
{
public:
friend
Dense4bitsBinIterator
;
Dense4bitsBin
(
data_size_t
num_data
)
:
num_data_
(
num_data
)
{
int
len
=
(
num_data_
+
1
)
/
2
;
data_
=
std
::
vector
<
uint8_t
>
(
len
,
static_cast
<
uint8_t
>
(
0
));
}
~
Dense4bitsBin
()
{
}
void
Push
(
int
,
data_size_t
idx
,
uint32_t
value
)
override
{
if
(
buf_
.
empty
())
{
#pragma omp critical
{
if
(
buf_
.
empty
())
{
int
len
=
(
num_data_
+
1
)
/
2
;
buf_
=
std
::
vector
<
uint8_t
>
(
len
,
static_cast
<
uint8_t
>
(
0
));
}
}
}
const
int
i1
=
idx
>>
1
;
const
int
i2
=
(
idx
&
1
)
<<
2
;
const
uint8_t
val
=
static_cast
<
uint8_t
>
(
value
)
<<
i2
;
if
(
i2
==
0
)
{
data_
[
i1
]
=
val
;
}
else
{
buf_
[
i1
]
=
val
;
}
}
void
ReSize
(
data_size_t
num_data
)
override
{
if
(
num_data_
!=
num_data
)
{
num_data_
=
num_data
;
int
len
=
(
num_data_
+
1
)
/
2
;
data_
.
resize
(
len
);
}
}
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
override
{
if
(
data_indices
!=
nullptr
)
{
// if use part of data
const
data_size_t
rest
=
num_data
&
0x3
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin0
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
idx
=
data_indices
[
i
+
1
];
const
auto
bin1
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
idx
=
data_indices
[
i
+
2
];
const
auto
bin2
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
idx
=
data_indices
[
i
+
3
];
const
auto
bin3
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
out
[
bin2
].
sum_gradients
+=
ordered_gradients
[
i
+
2
];
out
[
bin3
].
sum_gradients
+=
ordered_gradients
[
i
+
3
];
out
[
bin0
].
sum_hessians
+=
ordered_hessians
[
i
];
out
[
bin1
].
sum_hessians
+=
ordered_hessians
[
i
+
1
];
out
[
bin2
].
sum_hessians
+=
ordered_hessians
[
i
+
2
];
out
[
bin3
].
sum_hessians
+=
ordered_hessians
[
i
+
3
];
++
out
[
bin0
].
cnt
;
++
out
[
bin1
].
cnt
;
++
out
[
bin2
].
cnt
;
++
out
[
bin3
].
cnt
;
}
for
(;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
}
}
else
{
// use full data
const
data_size_t
rest
=
num_data
&
0x3
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
int
j
=
i
>>
1
;
const
auto
bin0
=
(
data_
[
j
])
&
0xf
;
const
auto
bin1
=
(
data_
[
j
]
>>
4
)
&
0xf
;
++
j
;
const
auto
bin2
=
(
data_
[
j
])
&
0xf
;
const
auto
bin3
=
(
data_
[
j
]
>>
4
)
&
0xf
;
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
out
[
bin2
].
sum_gradients
+=
ordered_gradients
[
i
+
2
];
out
[
bin3
].
sum_gradients
+=
ordered_gradients
[
i
+
3
];
out
[
bin0
].
sum_hessians
+=
ordered_hessians
[
i
];
out
[
bin1
].
sum_hessians
+=
ordered_hessians
[
i
+
1
];
out
[
bin2
].
sum_hessians
+=
ordered_hessians
[
i
+
2
];
out
[
bin3
].
sum_hessians
+=
ordered_hessians
[
i
+
3
];
++
out
[
bin0
].
cnt
;
++
out
[
bin1
].
cnt
;
++
out
[
bin2
].
cnt
;
++
out
[
bin3
].
cnt
;
}
for
(;
i
<
num_data
;
++
i
)
{
const
auto
bin
=
(
data_
[
i
>>
1
]
>>
((
i
&
1
)
<<
2
))
&
0xf
;
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
}
}
}
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
uint8_t
th
=
static_cast
<
uint8_t
>
(
threshold
+
min_bin
);
uint8_t
minb
=
static_cast
<
uint8_t
>
(
min_bin
);
uint8_t
maxb
=
static_cast
<
uint8_t
>
(
max_bin
);
if
(
default_bin
==
0
)
{
th
-=
1
;
}
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
}
data_size_t
num_data
()
const
override
{
return
num_data_
;
}
/*! \brief not ordered bin for dense feature */
OrderedBin
*
CreateOrderedBin
()
const
override
{
return
nullptr
;
}
void
FinishLoad
()
override
{
if
(
buf_
.
empty
())
{
return
;
}
int
len
=
(
num_data_
+
1
)
/
2
;
for
(
int
i
=
0
;
i
<
len
;
++
i
)
{
data_
[
i
]
|=
buf_
[
i
];
}
buf_
.
clear
();
}
void
LoadFromMemory
(
const
void
*
memory
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
override
{
const
uint8_t
*
mem_data
=
reinterpret_cast
<
const
uint8_t
*>
(
memory
);
if
(
!
local_used_indices
.
empty
())
{
const
data_size_t
rest
=
num_data_
&
1
;
for
(
int
i
=
0
;
i
<
num_data_
-
rest
;
i
+=
2
)
{
// get old bins
data_size_t
idx
=
local_used_indices
[
i
];
const
auto
bin1
=
static_cast
<
uint8_t
>
((
mem_data
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
);
idx
=
local_used_indices
[
i
+
1
];
const
auto
bin2
=
static_cast
<
uint8_t
>
((
mem_data
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
);
// add
const
int
i1
=
i
>>
1
;
data_
[
i1
]
=
(
bin1
|
(
bin2
<<
4
));
}
if
(
rest
)
{
data_size_t
idx
=
local_used_indices
[
num_data_
-
1
];
data_
[
num_data_
/
2
+
1
]
=
(
mem_data
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
}
}
else
{
for
(
size_t
i
=
0
;
i
<
data_
.
size
();
++
i
)
{
data_
[
i
]
=
mem_data
[
i
];
}
}
}
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
override
{
auto
other_bin
=
reinterpret_cast
<
const
Dense4bitsBin
*>
(
full_bin
);
const
data_size_t
rest
=
num_used_indices
&
1
;
for
(
int
i
=
0
;
i
<
num_used_indices
-
rest
;
i
+=
2
)
{
data_size_t
idx
=
used_indices
[
i
];
const
auto
bin1
=
static_cast
<
uint8_t
>
((
other_bin
->
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
);
idx
=
used_indices
[
i
+
1
];
const
auto
bin2
=
static_cast
<
uint8_t
>
((
other_bin
->
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
);
const
int
i1
=
i
>>
1
;
data_
[
i1
]
=
(
bin1
|
(
bin2
<<
4
));
}
if
(
rest
)
{
data_size_t
idx
=
used_indices
[
num_used_indices
-
1
];
data_
[
num_used_indices
/
2
+
1
]
=
(
other_bin
->
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
}
}
void
SaveBinaryToFile
(
FILE
*
file
)
const
override
{
fwrite
(
data_
.
data
(),
sizeof
(
uint8_t
),
data_
.
size
(),
file
);
}
size_t
SizesInByte
()
const
override
{
return
sizeof
(
uint8_t
)
*
data_
.
size
();
}
protected:
data_size_t
num_data_
;
std
::
vector
<
uint8_t
>
data_
;
std
::
vector
<
uint8_t
>
buf_
;
};
/*!
* \brief Fetch the 4-bit value for row idx, remapped into this iterator's
*        feature window; out-of-window values map to default_bin_.
*/
uint32_t Dense4bitsBinIterator::Get(data_size_t idx) {
  const uint32_t raw = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
  if (raw < min_bin_ || raw > max_bin_) {
    return default_bin_;
  }
  return raw - min_bin_ + bias_;
}
/*!
* \brief Create a window-remapping iterator over this bin container.
*        Caller takes ownership of the returned iterator.
*/
BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
  auto* iter = new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin);
  return iter;
}
}
// namespace LightGBM
#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
src/io/metadata.cpp
View file @
eade219e
...
...
@@ -12,6 +12,9 @@ Metadata::Metadata() {
num_init_score_
=
0
;
num_data_
=
0
;
num_queries_
=
0
;
weight_load_from_file_
=
false
;
query_load_from_file_
=
false
;
init_score_load_from_file_
=
false
;
}
void
Metadata
::
Init
(
const
char
*
data_filename
)
{
...
...
@@ -40,6 +43,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
for
(
data_size_t
i
=
0
;
i
<
num_weights_
;
++
i
)
{
weights_
[
i
]
=
0.0
f
;
}
weight_load_from_file_
=
false
;
}
if
(
query_idx
>=
0
)
{
if
(
!
query_boundaries_
.
empty
())
{
...
...
@@ -52,6 +56,7 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
queries_
[
i
]
=
0
;
}
query_load_from_file_
=
false
;
}
}
...
...
@@ -185,27 +190,17 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
}
else
{
if
(
!
queries_
.
empty
())
{
Log
::
Fatal
(
"Cannot used query_id for parallel training"
);
}
data_size_t
num_used_data
=
static_cast
<
data_size_t
>
(
used_data_indices
.
size
());
// check weights
if
(
weight_load_from_file_
)
{
if
(
weights_
.
size
()
>
0
&&
num_weights_
!=
num_all_data
)
{
weights_
.
clear
();
num_weights_
=
0
;
Log
::
Fatal
(
"Weights size doesn't match data size"
);
}
// check query boundaries
if
(
!
query_boundaries_
.
empty
()
&&
query_boundaries_
[
num_queries_
]
!=
num_all_data
)
{
query_boundaries_
.
clear
();
num_queries_
=
0
;
Log
::
Fatal
(
"Query size doesn't match data size"
);
}
// contain initial score file
if
(
!
init_score_
.
empty
()
&&
(
num_init_score_
%
num_all_data
)
!=
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
// get local weights
if
(
!
weights_
.
empty
())
{
auto
old_weights
=
weights_
;
...
...
@@ -217,7 +212,14 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
}
old_weights
.
clear
();
}
}
if
(
query_load_from_file_
)
{
// check query boundries
if
(
!
query_boundaries_
.
empty
()
&&
query_boundaries_
[
num_queries_
]
!=
num_all_data
)
{
query_boundaries_
.
clear
();
num_queries_
=
0
;
Log
::
Fatal
(
"Query size doesn't match data size"
);
}
// get local query boundaries
if
(
!
query_boundaries_
.
empty
())
{
std
::
vector
<
data_size_t
>
used_query
;
...
...
@@ -250,6 +252,14 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
}
old_query_boundaries
.
clear
();
}
}
if
(
init_score_load_from_file_
)
{
// contain initial score file
if
(
!
init_score_
.
empty
()
&&
(
num_init_score_
%
num_all_data
)
!=
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
// get local initial scores
if
(
!
init_score_
.
empty
())
{
...
...
@@ -258,14 +268,14 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
num_init_score_
=
static_cast
<
int64_t
>
(
num_data_
)
*
num_class
;
init_score_
=
std
::
vector
<
double
>
(
num_init_score_
);
#pragma omp parallel for schedule(static)
for
(
int
k
=
0
;
k
<
num_class
;
++
k
){
for
(
int
k
=
0
;
k
<
num_class
;
++
k
)
{
for
(
size_t
i
=
0
;
i
<
used_data_indices
.
size
();
++
i
)
{
init_score_
[
k
*
num_data_
+
i
]
=
old_scores
[
k
*
num_all_data
+
used_data_indices
[
i
]];
}
}
old_scores
.
clear
();
}
}
// re-load query weight
LoadQueryWeights
();
}
...
...
@@ -289,6 +299,7 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
for
(
int64_t
i
=
0
;
i
<
num_init_score_
;
++
i
)
{
init_score_
[
i
]
=
init_score
[
i
];
}
init_score_load_from_file_
=
false
;
}
void
Metadata
::
SetLabel
(
const
float
*
label
,
data_size_t
len
)
{
...
...
@@ -326,6 +337,7 @@ void Metadata::SetWeights(const float* weights, data_size_t len) {
weights_
[
i
]
=
weights
[
i
];
}
LoadQueryWeights
();
weight_load_from_file_
=
false
;
}
void
Metadata
::
SetQuery
(
const
data_size_t
*
query
,
data_size_t
len
)
{
...
...
@@ -352,48 +364,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
query
[
i
];
}
LoadQueryWeights
();
}
void
Metadata
::
SetQueryId
(
const
data_size_t
*
query_id
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// save to nullptr
if
(
query_id
==
nullptr
||
len
==
0
)
{
query_boundaries_
.
clear
();
queries_
.
clear
();
num_queries_
=
0
;
return
;
}
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of query id is not same with #data"
);
}
if
(
!
queries_
.
empty
())
{
queries_
.
clear
();
}
queries_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
for
(
data_size_t
i
=
0
;
i
<
num_weights_
;
++
i
)
{
queries_
[
i
]
=
query_id
[
i
];
}
// need convert query_id to boundaries
std
::
vector
<
data_size_t
>
tmp_buffer
;
data_size_t
last_qid
=
-
1
;
data_size_t
cur_cnt
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
last_qid
!=
queries_
[
i
])
{
if
(
cur_cnt
>
0
)
{
tmp_buffer
.
push_back
(
cur_cnt
);
}
cur_cnt
=
0
;
last_qid
=
queries_
[
i
];
}
++
cur_cnt
;
}
tmp_buffer
.
push_back
(
cur_cnt
);
query_boundaries_
=
std
::
vector
<
data_size_t
>
(
tmp_buffer
.
size
()
+
1
);
num_queries_
=
static_cast
<
data_size_t
>
(
tmp_buffer
.
size
());
query_boundaries_
[
0
]
=
0
;
for
(
size_t
i
=
0
;
i
<
tmp_buffer
.
size
();
++
i
)
{
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
tmp_buffer
[
i
];
}
queries_
.
clear
();
LoadQueryWeights
();
query_load_from_file_
=
false
;
}
void
Metadata
::
LoadWeights
()
{
...
...
@@ -415,6 +386,7 @@ void Metadata::LoadWeights() {
Common
::
Atof
(
reader
.
Lines
()[
i
].
c_str
(),
&
tmp_weight
);
weights_
[
i
]
=
static_cast
<
float
>
(
tmp_weight
);
}
weight_load_from_file_
=
true
;
}
void
Metadata
::
LoadInitialScore
()
{
...
...
@@ -457,6 +429,7 @@ void Metadata::LoadInitialScore() {
}
}
}
init_score_load_from_file_
=
true
;
}
void
Metadata
::
LoadQueryBoundaries
()
{
...
...
@@ -478,6 +451,7 @@ void Metadata::LoadQueryBoundaries() {
Common
::
Atoi
(
reader
.
Lines
()[
i
].
c_str
(),
&
tmp_cnt
);
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
static_cast
<
data_size_t
>
(
tmp_cnt
);
}
query_load_from_file_
=
true
;
}
void
Metadata
::
LoadQueryWeights
()
{
...
...
@@ -516,12 +490,14 @@ void Metadata::LoadFromMemory(const void* memory) {
weights_
=
std
::
vector
<
float
>
(
num_weights_
);
std
::
memcpy
(
weights_
.
data
(),
mem_ptr
,
sizeof
(
float
)
*
num_weights_
);
mem_ptr
+=
sizeof
(
float
)
*
num_weights_
;
weight_load_from_file_
=
true
;
}
if
(
num_queries_
>
0
)
{
if
(
!
query_boundaries_
.
empty
())
{
query_boundaries_
.
clear
();
}
query_boundaries_
=
std
::
vector
<
data_size_t
>
(
num_queries_
+
1
);
std
::
memcpy
(
query_boundaries_
.
data
(),
mem_ptr
,
sizeof
(
data_size_t
)
*
(
num_queries_
+
1
));
mem_ptr
+=
sizeof
(
data_size_t
)
*
(
num_queries_
+
1
);
query_load_from_file_
=
true
;
}
LoadQueryWeights
();
}
...
...
src/io/ordered_sparse_bin.hpp
View file @
eade219e
...
...
@@ -29,17 +29,19 @@ public:
struct
SparsePair
{
data_size_t
ridx
;
// data(row) index
VAL_T
bin
;
// bin for this data
SparsePair
(
data_size_t
r
,
VAL_T
b
)
:
ridx
(
r
),
bin
(
b
)
{}
SparsePair
()
:
ridx
(
0
),
bin
(
0
)
{}
};
OrderedSparseBin
(
const
SparseBin
<
VAL_T
>*
bin_data
)
:
bin_data_
(
bin_data
)
{
data_size_t
cur_pos
=
0
;
data_size_t
i_delta
=
-
1
;
int
non_zero_cnt
=
0
;
while
(
bin_data_
->
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
ordered_pair_
.
emplace_back
(
cur_pos
,
static_cast
<
VAL_T
>
(
0
))
;
++
non_zero_cnt
;
}
ordered_pair_
.
shrink_to_fit
();
ordered_pair_
.
resize
(
non_zero_cnt
);
leaf_cnt_
.
push_back
(
non_zero_cnt
);
}
~
OrderedSparseBin
()
{
...
...
@@ -81,17 +83,55 @@ public:
// get current leaf boundary
const
data_size_t
start
=
leaf_start_
[
leaf
];
const
data_size_t
end
=
start
+
leaf_cnt_
[
leaf
];
const
int
rest
=
(
end
-
start
)
%
4
;
data_size_t
i
=
start
;
// use data on current leaf to construct histogram
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
VAL_T
bin
=
ordered_pair_
[
i
].
bin
;
const
data_size_t
idx
=
ordered_pair_
[
i
].
ridx
;
out
[
bin
].
sum_gradients
+=
gradient
[
idx
];
out
[
bin
].
sum_hessians
+=
hessian
[
idx
];
++
out
[
bin
].
cnt
;
for
(;
i
<
end
-
rest
;
i
+=
4
)
{
const
VAL_T
bin0
=
ordered_pair_
[
i
].
bin
;
const
VAL_T
bin1
=
ordered_pair_
[
i
+
1
].
bin
;
const
VAL_T
bin2
=
ordered_pair_
[
i
+
2
].
bin
;
const
VAL_T
bin3
=
ordered_pair_
[
i
+
3
].
bin
;
const
auto
g0
=
gradient
[
ordered_pair_
[
i
].
ridx
];
const
auto
h0
=
hessian
[
ordered_pair_
[
i
].
ridx
];
const
auto
g1
=
gradient
[
ordered_pair_
[
i
+
1
].
ridx
];
const
auto
h1
=
hessian
[
ordered_pair_
[
i
+
1
].
ridx
];
const
auto
g2
=
gradient
[
ordered_pair_
[
i
+
2
].
ridx
];
const
auto
h2
=
hessian
[
ordered_pair_
[
i
+
2
].
ridx
];
const
auto
g3
=
gradient
[
ordered_pair_
[
i
+
3
].
ridx
];
const
auto
h3
=
hessian
[
ordered_pair_
[
i
+
3
].
ridx
];
out
[
bin0
].
sum_gradients
+=
g0
;
out
[
bin1
].
sum_gradients
+=
g1
;
out
[
bin2
].
sum_gradients
+=
g2
;
out
[
bin3
].
sum_gradients
+=
g3
;
out
[
bin0
].
sum_hessians
+=
h0
;
out
[
bin1
].
sum_hessians
+=
h1
;
out
[
bin2
].
sum_hessians
+=
h2
;
out
[
bin3
].
sum_hessians
+=
h3
;
++
out
[
bin0
].
cnt
;
++
out
[
bin1
].
cnt
;
++
out
[
bin2
].
cnt
;
++
out
[
bin3
].
cnt
;
}
for
(;
i
<
end
;
++
i
)
{
const
VAL_T
bin0
=
ordered_pair_
[
i
].
bin
;
const
auto
g0
=
gradient
[
ordered_pair_
[
i
].
ridx
];
const
auto
h0
=
hessian
[
ordered_pair_
[
i
].
ridx
];
out
[
bin0
].
sum_gradients
+=
g0
;
out
[
bin0
].
sum_hessians
+=
h0
;
++
out
[
bin0
].
cnt
;
}
}
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
left_indices
)
override
{
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
is_in_leaf
,
char
mark
)
override
{
// get current leaf boundary
const
data_size_t
l_start
=
leaf_start_
[
leaf
];
const
data_size_t
l_end
=
l_start
+
leaf_cnt_
[
leaf
];
...
...
@@ -99,7 +139,7 @@ public:
data_size_t
new_left_end
=
l_start
;
for
(
data_size_t
i
=
l_start
;
i
<
l_end
;
++
i
)
{
if
(
left_indices
[
ordered_pair_
[
i
].
ridx
])
{
if
(
is_in_leaf
[
ordered_pair_
[
i
].
ridx
]
==
mark
)
{
std
::
swap
(
ordered_pair_
[
new_left_end
],
ordered_pair_
[
i
]);
++
new_left_end
;
}
...
...
@@ -109,7 +149,9 @@ public:
leaf_cnt_
[
leaf
]
=
new_left_end
-
l_start
;
leaf_cnt_
[
right_leaf
]
=
l_end
-
new_left_end
;
}
data_size_t
NonZeroCount
(
int
leaf
)
const
override
{
return
static_cast
<
data_size_t
>
(
leaf_cnt_
[
leaf
]);
}
/*! \brief Disable copy */
OrderedSparseBin
<
VAL_T
>&
operator
=
(
const
OrderedSparseBin
<
VAL_T
>&
)
=
delete
;
/*! \brief Disable copy */
...
...
src/io/sparse_bin.hpp
View file @
eade219e
...
...
@@ -5,40 +5,59 @@
#include <LightGBM/bin.h>
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <cstring>
#include <cstdint>
#include <limits>
#include <vector>
namespace
LightGBM
{
template
<
typename
VAL_T
>
class
SparseBin
;
template
<
typename
VAL_T
>
class
SparseBin
;
const
size_t
kNumFastIndex
=
64
;
const
uint8_t
kMaxDelta
=
255
;
template
<
typename
VAL_T
>
class
SparseBinIterator
:
public
BinIterator
{
public:
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
:
bin_data_
(
bin_data
),
min_bin_
(
static_cast
<
VAL_T
>
(
min_bin
)),
max_bin_
(
static_cast
<
VAL_T
>
(
max_bin
)),
default_bin_
(
static_cast
<
VAL_T
>
(
default_bin
))
{
if
(
default_bin_
==
0
)
{
bias_
=
1
;
}
else
{
bias_
=
0
;
}
Reset
(
0
);
}
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
data_size_t
start_idx
)
:
bin_data_
(
bin_data
)
{
Reset
(
start_idx
);
}
inline
VAL_T
Inner
Get
(
data_size_t
idx
);
inline
VAL_T
Raw
Get
(
data_size_t
idx
);
inline
uint32_t
Get
(
data_size_t
idx
)
override
{
return
InnerGet
(
idx
);
inline
uint32_t
Get
(
data_size_t
idx
)
override
{
VAL_T
ret
=
RawGet
(
idx
);
if
(
ret
>=
min_bin_
&&
ret
<=
max_bin_
)
{
return
ret
-
min_bin_
+
bias_
;
}
else
{
return
default_bin_
;
}
}
inline
void
Reset
(
data_size_t
idx
);
inline
void
Reset
(
data_size_t
idx
)
override
;
private:
const
SparseBin
<
VAL_T
>*
bin_data_
;
data_size_t
cur_pos_
;
data_size_t
i_delta_
;
VAL_T
min_bin_
;
VAL_T
max_bin_
;
VAL_T
default_bin_
;
uint8_t
bias_
;
};
template
<
typename
VAL_T
>
...
...
@@ -50,32 +69,33 @@ public:
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
SparseBin
(
data_size_t
num_data
,
int
default_bin
)
SparseBin
(
data_size_t
num_data
)
:
num_data_
(
num_data
)
{
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
);
if
(
default_bin_
!=
0
)
{
Log
::
Info
(
"Warning: sparse feature with negative values, treating negative values as zero"
);
}
int
num_threads
=
1
;
#pragma omp parallel
#pragma omp master
{
num_threads_
=
omp_get_num_threads
();
}
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
push_buffers_
.
emplace_back
();
num_threads
=
omp_get_num_threads
();
}
push_buffers_
.
resize
(
num_threads
);
}
~
SparseBin
()
{
}
void
ReSize
(
data_size_t
num_data
)
override
{
num_data_
=
num_data
;
}
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
override
{
// not store zero data
if
(
value
<=
default_bin_
)
{
return
;
}
push_buffers_
[
tid
].
emplace_back
(
idx
,
static_cast
<
VAL_T
>
(
value
));
auto
cur_bin
=
static_cast
<
VAL_T
>
(
value
);
if
(
cur_bin
!=
0
)
{
push_buffers_
[
tid
].
emplace_back
(
idx
,
cur_bin
);
}
}
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
,
data_size_t
,
const
score_t
*
,
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
...
...
@@ -86,36 +106,72 @@ public:
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
data_size_t
*
cur_pos
)
const
{
++
(
*
i_delta
);
*
cur_pos
+=
deltas_
[
*
i_delta
]
;
data_size_t
factor
=
1
;
data_size_t
shift
=
0
;
data_size_t
delta
=
deltas_
[
*
i_delta
]
;
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
0
)
{
++
(
*
i_delta
);
factor
*=
kMaxDelta
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
shift
+=
8
;
delta
|=
static_cast
<
data_size_t
>
(
deltas_
[
*
i_delta
]
)
<<
shift
;
}
if
(
*
i_delta
>=
0
&&
*
i_delta
<
num_vals_
)
{
*
cur_pos
+=
delta
;
if
(
*
i_delta
<
num_vals_
)
{
return
true
;
}
else
{
*
cur_pos
=
num_data_
;
return
false
;
}
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
VAL_T
maxb
=
static_cast
<
VAL_T
>
(
max_bin
);
if
(
default_bin
==
0
)
{
th
-=
1
;
}
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
InnerGet
(
idx
);
if
(
bin
>
threshold
)
{
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
}
...
...
@@ -125,44 +181,40 @@ public:
void
FinishLoad
()
override
{
// get total non zero size
size_t
non_zero_size
=
0
;
size_t
pair_cnt
=
0
;
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero_size
+=
push_buffers_
[
i
].
size
();
pair_cnt
+=
push_buffers_
[
i
].
size
();
}
// merge
non_zero_pair_
.
reserve
(
non_zero_size
);
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero_pair_
.
insert
(
non_zero_pair_
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val_pairs
=
push_buffers_
[
0
];
idx_val_pairs
.
reserve
(
pair_cnt
);
for
(
size_t
i
=
1
;
i
<
push_buffers_
.
size
();
++
i
)
{
idx_val_pairs
.
insert
(
idx_val_pairs
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
shrink_to_fit
();
}
push_buffers_
.
clear
();
push_buffers_
.
shrink_to_fit
();
// sort by data index
std
::
sort
(
non_zero
_pair
_
.
begin
(),
non_zero
_pair
_
.
end
(),
std
::
sort
(
idx_val
_pair
s
.
begin
(),
idx_val
_pair
s
.
end
(),
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
return
a
.
first
<
b
.
first
;
});
// load delta array
LoadFromPair
(
non_zero_pair_
);
// free memory
non_zero_pair_
.
clear
();
non_zero_pair_
.
shrink_to_fit
();
// load delta array
LoadFromPair
(
idx_val_pairs
);
}
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
non_zero
_pair
)
{
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val
_pair
s
)
{
deltas_
.
clear
();
vals_
.
clear
();
// transform to delta array
data_size_t
last_idx
=
0
;
for
(
size_t
i
=
0
;
i
<
non_zero
_pair
.
size
();
++
i
)
{
const
data_size_t
cur_idx
=
non_zero
_pair
[
i
].
first
;
const
VAL_T
bin
=
non_zero
_pair
[
i
].
second
;
for
(
size_t
i
=
0
;
i
<
idx_val
_pair
s
.
size
();
++
i
)
{
const
data_size_t
cur_idx
=
idx_val
_pair
s
[
i
].
first
;
const
VAL_T
bin
=
idx_val
_pair
s
[
i
].
second
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
while
(
cur_delta
>
kMaxDelta
)
{
deltas_
.
push_back
(
cur_delta
%
kMaxDelta
);
while
(
cur_delta
>
=
256
)
{
deltas_
.
push_back
(
cur_delta
&
0xff
);
vals_
.
push_back
(
0
);
cur_delta
/=
kMaxDelta
;
cur_delta
>>=
8
;
}
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
vals_
.
push_back
(
bin
);
...
...
@@ -259,28 +311,57 @@ public:
}
LoadFromPair
(
tmp_pair
);
}
}
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
override
{
auto
other_bin
=
reinterpret_cast
<
const
SparseBin
<
VAL_T
>*>
(
full_bin
);
SparseBinIterator
<
VAL_T
>
iterator
(
other_bin
,
used_indices
[
0
]);
deltas_
.
clear
();
vals_
.
clear
();
// transform to delta array
data_size_t
last_idx
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
VAL_T
bin
=
iterator
.
RawGet
(
used_indices
[
i
]);
if
(
bin
>
0
)
{
data_size_t
cur_delta
=
i
-
last_idx
;
while
(
cur_delta
>=
256
)
{
deltas_
.
push_back
(
cur_delta
&
0xff
);
vals_
.
push_back
(
0
);
cur_delta
>>=
8
;
}
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
vals_
.
push_back
(
bin
);
last_idx
=
i
;
}
}
// avoid out of range
deltas_
.
push_back
(
0
);
num_vals_
=
static_cast
<
data_size_t
>
(
vals_
.
size
());
// reduce memory cost
deltas_
.
shrink_to_fit
();
vals_
.
shrink_to_fit
();
// generate fast index
GetFastIndex
();
}
protected:
data_size_t
num_data_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero_pair_
;
std
::
vector
<
uint8_t
>
deltas_
;
std
::
vector
<
VAL_T
>
vals_
;
data_size_t
num_vals_
;
int
num_threads_
;
std
::
vector
<
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>>
push_buffers_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
data_size_t
>>
fast_index_
;
data_size_t
fast_index_shift_
;
VAL_T
default_bin_
;
};
template
<
typename
VAL_T
>
inline
VAL_T
SparseBinIterator
<
VAL_T
>::
Inner
Get
(
data_size_t
idx
)
{
while
(
cur_pos_
<
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
inline
VAL_T
SparseBinIterator
<
VAL_T
>::
Raw
Get
(
data_size_t
idx
)
{
while
(
cur_pos_
<
idx
)
{
bin_data_
->
NextNonzero
(
&
i_delta_
,
&
cur_pos_
);
}
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
&&
i_delta_
>=
0
)
{
if
(
cur_pos_
==
idx
)
{
return
bin_data_
->
vals_
[
i_delta_
];
}
else
{
return
0
;
...
...
@@ -295,38 +376,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
}
template
<
typename
VAL_T
>
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
data_size_t
start_idx
)
const
{
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
start_idx
);
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
{
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
min_bin
,
max_bin
,
default_bin
);
}
template
<
typename
VAL_T
>
class
SparseCategoricalBin
:
public
SparseBin
<
VAL_T
>
{
public:
SparseCategoricalBin
(
data_size_t
num_data
,
int
default_bin
)
:
SparseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
InnerGet
(
idx
);
if
(
bin
!=
threshold
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
return
lte_count
;
}
};
}
// namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment