Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
4f77bd28
Commit
4f77bd28
authored
Feb 20, 2017
by
Guolin Ke
Browse files
update to v2.
parent
13d4581b
Changes
64
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1063 additions
and
869 deletions
+1063
-869
src/io/dataset.cpp
src/io/dataset.cpp
+254
-28
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+127
-136
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+72
-58
src/io/ordered_sparse_bin.hpp
src/io/ordered_sparse_bin.hpp
+6
-3
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+62
-62
src/io/tree.cpp
src/io/tree.cpp
+63
-45
src/metric/binary_metric.hpp
src/metric/binary_metric.hpp
+7
-7
src/objective/binary_objective.hpp
src/objective/binary_objective.hpp
+8
-5
src/objective/rank_objective.hpp
src/objective/rank_objective.hpp
+1
-0
src/objective/regression_objective.hpp
src/objective/regression_objective.hpp
+4
-4
src/treelearner/data_parallel_tree_learner.cpp
src/treelearner/data_parallel_tree_learner.cpp
+44
-41
src/treelearner/data_partition.hpp
src/treelearner/data_partition.hpp
+3
-3
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+90
-196
src/treelearner/feature_parallel_tree_learner.cpp
src/treelearner/feature_parallel_tree_learner.cpp
+4
-15
src/treelearner/leaf_splits.hpp
src/treelearner/leaf_splits.hpp
+3
-24
src/treelearner/parallel_tree_learner.h
src/treelearner/parallel_tree_learner.h
+4
-0
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+140
-169
src/treelearner/serial_tree_learner.h
src/treelearner/serial_tree_learner.h
+5
-40
src/treelearner/split_info.hpp
src/treelearner/split_info.hpp
+31
-0
src/treelearner/voting_parallel_tree_learner.cpp
src/treelearner/voting_parallel_tree_learner.cpp
+135
-33
No files found.
src/io/dataset.cpp
View file @
4f77bd28
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/feature.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/utils/array_args.h>
#include <cstdio>
#include <cstdio>
#include <unordered_map>
#include <unordered_map>
...
@@ -16,6 +16,8 @@ namespace LightGBM {
...
@@ -16,6 +16,8 @@ namespace LightGBM {
const
char
*
Dataset
::
binary_file_token
=
"______LightGBM_Binary_File_Token______
\n
"
;
const
char
*
Dataset
::
binary_file_token
=
"______LightGBM_Binary_File_Token______
\n
"
;
Dataset
::
Dataset
()
{
Dataset
::
Dataset
()
{
data_filename_
=
"noname"
;
data_filename_
=
"noname"
;
num_data_
=
0
;
num_data_
=
0
;
...
@@ -24,50 +26,189 @@ Dataset::Dataset() {
...
@@ -24,50 +26,189 @@ Dataset::Dataset() {
Dataset
::
Dataset
(
data_size_t
num_data
)
{
Dataset
::
Dataset
(
data_size_t
num_data
)
{
data_filename_
=
"noname"
;
data_filename_
=
"noname"
;
num_data_
=
num_data
;
num_data_
=
num_data
;
metadata_
.
Init
(
num_data_
,
-
1
,
-
1
);
metadata_
.
Init
(
num_data_
,
NO_SPECIFIC
,
NO_SPECIFIC
);
}
}
Dataset
::~
Dataset
()
{
Dataset
::~
Dataset
()
{
}
/*!
 * \brief Trivial grouping strategy: put every used feature in a group of its own.
 * \param used_features Indices of the features to group
 * \return One single-element vector per entry of used_features, in the same order
 */
std::vector<std::vector<int>> NoGroup(const std::vector<int>& used_features) {
  std::vector<std::vector<int>> singleton_groups;
  singleton_groups.reserve(used_features.size());
  for (const int feature_idx : used_features) {
    // each group holds exactly the one feature it was created for
    singleton_groups.emplace_back(std::vector<int>{feature_idx});
  }
  return singleton_groups;
}
void
Dataset
::
Construct
(
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
const
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
size_t
total_sample_cnt
,
const
IOConfig
&
io_config
)
{
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
// get num_features
std
::
vector
<
int
>
used_features
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
bin_mappers
.
size
());
++
i
)
{
if
(
bin_mappers
[
i
]
!=
nullptr
&&
!
bin_mappers
[
i
]
->
is_trival
())
{
used_features
.
emplace_back
(
i
);
}
}
auto
features_in_group
=
NoGroup
(
used_features
);
num_features_
=
0
;
for
(
const
auto
&
fs
:
features_in_group
)
{
num_features_
+=
static_cast
<
int
>
(
fs
.
size
());
}
int
cur_fidx
=
0
;
used_feature_map_
=
std
::
vector
<
int
>
(
num_total_features_
,
-
1
);
num_groups_
=
static_cast
<
int
>
(
features_in_group
.
size
());
real_feature_idx_
.
resize
(
num_features_
);
feature2group_
.
resize
(
num_features_
);
feature2subfeature_
.
resize
(
num_features_
);
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
auto
cur_features
=
features_in_group
[
i
];
int
cur_cnt_features
=
static_cast
<
int
>
(
cur_features
.
size
());
// get bin_mappers
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
cur_bin_mappers
;
for
(
int
j
=
0
;
j
<
cur_cnt_features
;
++
j
)
{
int
real_fidx
=
cur_features
[
j
];
used_feature_map_
[
real_fidx
]
=
cur_fidx
;
real_feature_idx_
[
cur_fidx
]
=
real_fidx
;
feature2group_
[
cur_fidx
]
=
i
;
feature2subfeature_
[
cur_fidx
]
=
j
;
cur_bin_mappers
.
emplace_back
(
bin_mappers
[
real_fidx
].
release
());
++
cur_fidx
;
}
feature_groups_
.
emplace_back
(
std
::
unique_ptr
<
FeatureGroup
>
(
new
FeatureGroup
(
cur_cnt_features
,
cur_bin_mappers
,
num_data_
,
io_config
.
is_enable_sparse
)));
}
feature_groups_
.
shrink_to_fit
();
group_bin_boundaries_
.
clear
();
uint64_t
num_total_bin
=
0
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
num_total_bin
+=
feature_groups_
[
i
]
->
num_total_bin_
;
group_bin_boundaries_
.
push_back
(
num_total_bin
);
}
int
last_group
=
0
;
group_feature_start_
.
reserve
(
num_groups_
);
group_feature_cnt_
.
reserve
(
num_groups_
);
group_feature_start_
.
push_back
(
0
);
group_feature_cnt_
.
push_back
(
1
);
for
(
int
i
=
1
;
i
<
num_features_
;
++
i
)
{
const
int
group
=
feature2group_
[
i
];
if
(
group
==
last_group
)
{
group_feature_cnt_
.
back
()
=
group_feature_cnt_
.
back
()
+
1
;
}
else
{
group_feature_start_
.
push_back
(
i
);
group_feature_cnt_
.
push_back
(
1
);
last_group
=
group
;
}
}
}
}
void
Dataset
::
FinishLoad
()
{
void
Dataset
::
FinishLoad
()
{
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_
feature
s_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
group
s_
;
++
i
)
{
feature
s_
[
i
]
->
FinishLoad
();
feature
_groups_
[
i
]
->
bin_data_
->
FinishLoad
();
}
}
}
}
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
)
{
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
)
{
features_
.
clear
();
feature
_group
s_
.
clear
();
num_features_
=
dataset
->
num_features_
;
num_features_
=
dataset
->
num_features_
;
num_groups_
=
dataset
->
num_groups_
;
bool
is_enable_sparse
=
false
;
bool
is_enable_sparse
=
false
;
for
(
int
i
=
0
;
i
<
num_
feature
s_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
group
s_
;
++
i
)
{
if
(
dataset
->
features_
[
i
]
->
is_sparse
()
)
{
if
(
dataset
->
feature
_group
s_
[
i
]
->
is_sparse
_
)
{
is_enable_sparse
=
true
;
is_enable_sparse
=
true
;
break
;
break
;
}
}
}
}
// copy feature bin mapper data
// copy feature bin mapper data
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
){
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
features_
.
emplace_back
(
new
Feature
(
dataset
->
features_
[
i
]
->
feature_index
(),
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers
;
new
BinMapper
(
*
(
dataset
->
features_
[
i
]
->
bin_mapper
())),
for
(
int
j
=
0
;
j
<
dataset
->
feature_groups_
[
i
]
->
num_feature_
;
++
j
)
{
bin_mappers
.
emplace_back
(
new
BinMapper
(
*
(
dataset
->
feature_groups_
[
i
]
->
bin_mappers_
[
j
])));
}
feature_groups_
.
emplace_back
(
new
FeatureGroup
(
dataset
->
feature_groups_
[
i
]
->
num_feature_
,
bin_mappers
,
num_data_
,
num_data_
,
is_enable_sparse
));
is_enable_sparse
));
}
}
features_
.
shrink_to_fit
();
feature
_group
s_
.
shrink_to_fit
();
used_feature_map_
=
dataset
->
used_feature_map_
;
used_feature_map_
=
dataset
->
used_feature_map_
;
num_total_features_
=
dataset
->
num_total_features_
;
num_total_features_
=
dataset
->
num_total_features_
;
feature_names_
=
dataset
->
feature_names_
;
feature_names_
=
dataset
->
feature_names_
;
label_idx_
=
dataset
->
label_idx_
;
label_idx_
=
dataset
->
label_idx_
;
real_feature_idx_
=
dataset
->
real_feature_idx_
;
feature2group_
=
dataset
->
feature2group_
;
feature2subfeature_
=
dataset
->
feature2subfeature_
;
group_bin_boundaries_
=
dataset
->
group_bin_boundaries_
;
group_feature_start_
=
dataset
->
group_feature_start_
;
group_feature_cnt_
=
dataset
->
group_feature_cnt_
;
}
/*!
 * \brief Set this dataset up as a validation set that shares the feature layout of another dataset.
 *
 * Every feature of the reference dataset gets a group of its own here,
 * regardless of how the reference dataset groups its features. Bin mappers are
 * deep-copied; no bin data is copied. num_data_ is read but not set here.
 * NOTE(review): presumably num_data_ must already hold the row count of the
 * validation data when this is called - confirm against the caller.
 *
 * \param dataset Reference dataset whose bin mappers and feature maps are copied
 */
void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  // one group per feature
  num_groups_ = num_features_;
  // every group is created with is_enable_sparse = true
  bool is_enable_sparse = true;
  feature2group_.clear();
  feature2subfeature_.clear();
  // drop stale entries so that these tables always describe exactly num_groups_ groups
  group_feature_start_.clear();
  group_feature_cnt_.clear();
  feature_groups_.reserve(num_groups_);
  feature2group_.reserve(num_features_);
  feature2subfeature_.reserve(num_features_);
  group_feature_start_.reserve(num_groups_);
  group_feature_cnt_.reserve(num_groups_);
  // copy feature bin mapper data
  for (int i = 0; i < num_features_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
    feature_groups_.emplace_back(new FeatureGroup(1, bin_mappers, num_data_, is_enable_sparse));
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
    // group i starts at feature i and holds exactly that one feature
    group_feature_start_.push_back(i);
    group_feature_cnt_.push_back(1);
  }
  feature_groups_.shrink_to_fit();

  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;

  // prefix sums of the per-group bin counts
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  for (int i = 0; i < num_groups_; ++i) {
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
}
}
void
Dataset
::
ReSize
(
data_size_t
num_data
)
{
void
Dataset
::
ReSize
(
data_size_t
num_data
)
{
if
(
num_data_
!=
num_data
)
{
if
(
num_data_
!=
num_data
)
{
num_data_
=
num_data
;
num_data_
=
num_data
;
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
fidx
=
0
;
fidx
<
num_
feature
s_
;
++
fidx
)
{
for
(
int
group
=
0
;
group
<
num_
group
s_
;
++
group
)
{
feature
s_
[
fidx
]
->
ReSize
(
num_data_
);
feature
_groups_
[
group
]
->
bin_data_
->
ReSize
(
num_data_
);
}
}
}
}
}
}
...
@@ -75,8 +216,8 @@ void Dataset::ReSize(data_size_t num_data) {
...
@@ -75,8 +216,8 @@ void Dataset::ReSize(data_size_t num_data) {
void
Dataset
::
CopySubset
(
const
Dataset
*
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
need_meta_data
)
{
void
Dataset
::
CopySubset
(
const
Dataset
*
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
need_meta_data
)
{
CHECK
(
num_used_indices
==
num_data_
);
CHECK
(
num_used_indices
==
num_data_
);
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
fidx
=
0
;
fidx
<
num_
feature
s_
;
++
fidx
)
{
for
(
int
group
=
0
;
group
<
num_
group
s_
;
++
group
)
{
feature
s_
[
fidx
]
->
CopySubset
(
fullset
->
feature
s_
[
fidx
].
get
(),
used_indices
,
num_used_indices
);
feature
_groups_
[
group
]
->
CopySubset
(
fullset
->
feature
_groups_
[
group
].
get
(),
used_indices
,
num_used_indices
);
}
}
if
(
need_meta_data
)
{
if
(
need_meta_data
)
{
metadata_
.
Init
(
fullset
->
metadata_
,
used_indices
,
num_used_indices
);
metadata_
.
Init
(
fullset
->
metadata_
,
used_indices
,
num_used_indices
);
...
@@ -158,8 +299,8 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
...
@@ -158,8 +299,8 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
}
}
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
if
(
bin_filename
!=
nullptr
if
(
bin_filename
!=
nullptr
&&
std
::
string
(
bin_filename
)
==
std
::
string
(
data_filename_
))
{
&&
std
::
string
(
bin_filename
)
==
std
::
string
(
data_filename_
))
{
Log
::
Warning
(
"Bianry file %s already existed"
,
bin_filename
);
Log
::
Warning
(
"Bianry file %s already existed"
,
bin_filename
);
return
;
return
;
}
}
...
@@ -196,8 +337,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -196,8 +337,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
size_t
size_of_token
=
std
::
strlen
(
binary_file_token
);
size_t
size_of_token
=
std
::
strlen
(
binary_file_token
);
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
// get size of header
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
+
sizeof
(
int
)
*
num_total_features_
+
sizeof
(
num_groups_
)
+
3
*
sizeof
(
int
)
*
num_features_
+
sizeof
(
uint64_t
)
*
(
num_groups_
+
1
)
+
2
*
sizeof
(
int
)
*
num_groups_
;
// size of feature names
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
size_of_header
+=
feature_names_
[
i
].
size
()
+
sizeof
(
int
);
size_of_header
+=
feature_names_
[
i
].
size
()
+
sizeof
(
int
);
...
@@ -206,10 +348,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -206,10 +348,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// write header
// write header
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_total_features_
),
1
,
file
);
size_t
num_used_feature_map
=
used_feature_map_
.
size
();
fwrite
(
used_feature_map_
.
data
(),
sizeof
(
int
),
num_total_features_
,
file
);
fwrite
(
&
num_used_feature_map
,
sizeof
(
num_used_feature_map
),
1
,
file
);
fwrite
(
&
num_groups_
,
sizeof
(
num_groups_
),
1
,
file
);
fwrite
(
used_feature_map_
.
data
(),
sizeof
(
int
),
num_used_feature_map
,
file
);
fwrite
(
real_feature_idx_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
feature2group_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
feature2subfeature_
.
data
(),
sizeof
(
int
),
num_features_
,
file
);
fwrite
(
group_bin_boundaries_
.
data
(),
sizeof
(
uint64_t
),
num_groups_
+
1
,
file
);
fwrite
(
group_feature_start_
.
data
(),
sizeof
(
int
),
num_groups_
,
file
);
fwrite
(
group_feature_cnt_
.
data
(),
sizeof
(
int
),
num_groups_
,
file
);
// write feature names
// write feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
...
@@ -226,15 +373,94 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -226,15 +373,94 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
metadata_
.
SaveBinaryToFile
(
file
);
metadata_
.
SaveBinaryToFile
(
file
);
// write feature data
// write feature data
for
(
int
i
=
0
;
i
<
num_
feature
s_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
group
s_
;
++
i
)
{
// get size of feature
// get size of feature
size_t
size_of_feature
=
features_
[
i
]
->
SizesInByte
();
size_t
size_of_feature
=
feature
_group
s_
[
i
]
->
SizesInByte
();
fwrite
(
&
size_of_feature
,
sizeof
(
size_of_feature
),
1
,
file
);
fwrite
(
&
size_of_feature
,
sizeof
(
size_of_feature
),
1
,
file
);
// write feature
// write feature
features_
[
i
]
->
SaveBinaryToFile
(
file
);
feature
_group
s_
[
i
]
->
SaveBinaryToFile
(
file
);
}
}
fclose
(
file
);
fclose
(
file
);
}
}
}
}
void
Dataset
::
ConstructHistograms
(
const
std
::
vector
<
int8_t
>&
is_feature_used
,
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
int
leaf_idx
,
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>&
ordered_bins
,
const
score_t
*
gradients
,
const
score_t
*
hessians
,
score_t
*
ordered_gradients
,
score_t
*
ordered_hessians
,
HistogramBinEntry
*
hist_data
)
const
{
if
(
leaf_idx
<
0
||
num_data
<=
0
||
hist_data
==
nullptr
)
{
return
;
}
auto
ptr_ordered_grad
=
gradients
;
auto
ptr_ordered_hess
=
hessians
;
if
(
data_indices
!=
nullptr
&&
num_data
<
num_data_
)
{
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
ordered_gradients
[
i
]
=
gradients
[
data_indices
[
i
]];
ordered_hessians
[
i
]
=
hessians
[
data_indices
[
i
]];
}
ptr_ordered_grad
=
ordered_gradients
;
ptr_ordered_hess
=
ordered_hessians
;
}
#pragma omp parallel for schedule(guided)
for
(
int
group
=
0
;
group
<
num_groups_
;
++
group
)
{
bool
is_groud_used
=
false
;
const
int
f_cnt
=
group_feature_cnt_
[
group
];
for
(
int
j
=
0
;
j
<
f_cnt
;
++
j
)
{
const
int
fidx
=
group_feature_start_
[
group
]
+
j
;
if
(
is_feature_used
[
fidx
])
{
is_groud_used
=
true
;
break
;
}
}
if
(
!
is_groud_used
)
{
continue
;
}
// feature is not used
auto
data_ptr
=
hist_data
+
group_bin_boundaries_
[
group
];
const
int
num_bin
=
feature_groups_
[
group
]
->
num_total_bin_
;
std
::
memset
(
data_ptr
+
1
,
0
,
(
num_bin
-
1
)
*
sizeof
(
HistogramBinEntry
));
// construct histograms for smaller leaf
if
(
ordered_bins
[
group
]
==
nullptr
)
{
// if not use ordered bin
feature_groups_
[
group
]
->
bin_data_
->
ConstructHistogram
(
data_indices
,
num_data
,
ptr_ordered_grad
,
ptr_ordered_hess
,
data_ptr
);
}
else
{
// used ordered bin
ordered_bins
[
group
]
->
ConstructHistogram
(
leaf_idx
,
gradients
,
hessians
,
data_ptr
);
}
}
}
void
Dataset
::
FixHistogram
(
int
feature_idx
,
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
HistogramBinEntry
*
data
)
const
{
const
int
group
=
feature2group_
[
feature_idx
];
const
int
sub_feature
=
feature2subfeature_
[
feature_idx
];
const
BinMapper
*
bin_mapper
=
feature_groups_
[
group
]
->
bin_mappers_
[
sub_feature
].
get
();
const
int
default_bin
=
bin_mapper
->
GetDefaultBin
();
if
(
default_bin
>
0
)
{
const
int
num_bin
=
bin_mapper
->
num_bin
();
data
[
default_bin
].
sum_gradients
=
sum_gradient
;
data
[
default_bin
].
sum_hessians
=
sum_hessian
;
data
[
default_bin
].
cnt
=
num_data
;
for
(
int
i
=
0
;
i
<
num_bin
;
++
i
)
{
if
(
i
!=
default_bin
)
{
data
[
default_bin
].
sum_gradients
-=
data
[
i
].
sum_gradients
;
data
[
default_bin
].
sum_hessians
-=
data
[
i
].
sum_hessians
;
data
[
default_bin
].
cnt
-=
data
[
i
].
cnt
;
}
}
}
}
}
// namespace LightGBM
}
// namespace LightGBM
src/io/dataset_loader.cpp
View file @
4f77bd28
This diff is collapsed.
Click to expand it.
src/io/dense_bin.hpp
View file @
4f77bd28
...
@@ -9,15 +9,41 @@
...
@@ -9,15 +9,41 @@
namespace
LightGBM
{
namespace
LightGBM
{
template
<
typename
VAL_T
>
class
DenseBin
;
/*!
 * \brief Iterator that reads one feature's bins out of a DenseBin.
 *
 * The iterator is configured with the range [min_bin, max_bin] and the default
 * bin of the feature. Get() (defined out of class) returns
 * value - min_bin + bias for stored values inside the range and the default
 * bin otherwise, where bias is 1 if the default bin is 0 and 0 otherwise.
 */
template <typename VAL_T>
class DenseBinIterator: public BinIterator {
public:
  /*!
   * \param bin_data Dense bin to read from; not owned
   * \param min_bin Smallest stored value that belongs to the feature
   * \param max_bin Largest stored value that belongs to the feature
   * \param default_bin Default bin of the feature
   */
  explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
    : bin_data_(bin_data),
    min_bin_(static_cast<VAL_T>(min_bin)),
    max_bin_(static_cast<VAL_T>(max_bin)),
    // cast to VAL_T, not uint8_t: default_bin_ is a VAL_T and a narrower cast
    // would truncate default bins above 255 for 16/32-bit value types
    default_bin_(static_cast<VAL_T>(default_bin)),
    // members are initialized in declaration order, so default_bin_ is already set here
    bias_(default_bin_ == 0 ? 1 : 0) {
  }
  inline uint32_t Get(data_size_t idx) override;
  /*! \brief No positional state is kept, so there is nothing to reset */
  inline void Reset(data_size_t) override { }
private:
  /*! \brief Dense bin being read; not owned */
  const DenseBin<VAL_T>* bin_data_;
  VAL_T min_bin_;
  VAL_T max_bin_;
  VAL_T default_bin_;
  /*! \brief 1 if the default bin is 0, otherwise 0 */
  uint8_t bias_;
};
/*!
/*!
* \brief Used to store bins for dense feature
* \brief Used to store bins for dense feature
* Use template to reduce memory cost
* Use template to reduce memory cost
*/
*/
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
DenseBin
:
public
Bin
{
class
DenseBin
:
public
Bin
{
public:
public:
DenseBin
(
data_size_t
num_data
,
uint32_t
default_bin
)
friend
DenseBinIterator
<
VAL_T
>
;
:
num_data_
(
num_data
),
data_
(
num_data_
,
static_cast
<
VAL_T
>
(
default_bin
))
{
DenseBin
(
data_size_t
num_data
)
:
num_data_
(
num_data
),
data_
(
num_data_
,
static_cast
<
VAL_T
>
(
0
))
{
}
}
~
DenseBin
()
{
~
DenseBin
()
{
...
@@ -34,24 +60,20 @@ public:
...
@@ -34,24 +60,20 @@ public:
}
}
}
}
inline
uint32_t
Get
(
data_size_t
idx
)
const
{
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
override
;
return
static_cast
<
uint32_t
>
(
data_
[
idx
]);
}
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
override
{
HistogramBinEntry
*
out
)
const
override
{
// use 4-way unrolling, will be faster
// use 4-way unrolling, will be faster
if
(
data_indices
!=
nullptr
)
{
// if use part of data
if
(
data_indices
!=
nullptr
)
{
// if use part of data
data_size_t
rest
=
num_data
%
4
;
const
data_size_t
rest
=
num_data
%
4
;
data_size_t
i
=
0
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
VAL_T
bin0
=
data_
[
data_indices
[
i
]];
const
VAL_T
bin0
=
data_
[
data_indices
[
i
]];
VAL_T
bin1
=
data_
[
data_indices
[
i
+
1
]];
const
VAL_T
bin1
=
data_
[
data_indices
[
i
+
1
]];
VAL_T
bin2
=
data_
[
data_indices
[
i
+
2
]];
const
VAL_T
bin2
=
data_
[
data_indices
[
i
+
2
]];
VAL_T
bin3
=
data_
[
data_indices
[
i
+
3
]];
const
VAL_T
bin3
=
data_
[
data_indices
[
i
+
3
]];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
...
@@ -69,19 +91,19 @@ public:
...
@@ -69,19 +91,19 @@ public:
++
out
[
bin3
].
cnt
;
++
out
[
bin3
].
cnt
;
}
}
for
(;
i
<
num_data
;
++
i
)
{
for
(;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
data_
[
data_indices
[
i
]];
const
VAL_T
bin
=
data_
[
data_indices
[
i
]];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
++
out
[
bin
].
cnt
;
}
}
}
else
{
// use full data
}
else
{
// use full data
data_size_t
rest
=
num_data
%
4
;
const
data_size_t
rest
=
num_data
%
4
;
data_size_t
i
=
0
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
VAL_T
bin0
=
data_
[
i
];
const
VAL_T
bin0
=
data_
[
i
];
VAL_T
bin1
=
data_
[
i
+
1
];
const
VAL_T
bin1
=
data_
[
i
+
1
];
VAL_T
bin2
=
data_
[
i
+
2
];
const
VAL_T
bin2
=
data_
[
i
+
2
];
VAL_T
bin3
=
data_
[
i
+
3
];
const
VAL_T
bin3
=
data_
[
i
+
3
];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin0
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
out
[
bin1
].
sum_gradients
+=
ordered_gradients
[
i
+
1
];
...
@@ -99,7 +121,7 @@ public:
...
@@ -99,7 +121,7 @@ public:
++
out
[
bin3
].
cnt
;
++
out
[
bin3
].
cnt
;
}
}
for
(;
i
<
num_data
;
++
i
)
{
for
(;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
data_
[
i
];
const
VAL_T
bin
=
data_
[
i
];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_gradients
+=
ordered_gradients
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
++
out
[
bin
].
cnt
;
...
@@ -107,13 +129,31 @@ public:
...
@@ -107,13 +129,31 @@ public:
}
}
}
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
VAL_T
maxb
=
static_cast
<
VAL_T
>
(
max_bin
);
if
(
default_bin
==
0
)
{
th
-=
1
;
}
data_size_t
lte_count
=
0
;
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
data_size_t
idx
=
data_indices
[
i
];
const
data_size_t
idx
=
data_indices
[
i
];
if
(
data_
[
idx
]
>
threshold
)
{
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
lte_indices
[
lte_count
++
]
=
idx
;
...
@@ -162,45 +202,19 @@ protected:
...
@@ -162,45 +202,19 @@ protected:
};
};
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
DenseBinIterator
:
public
BinIterator
{
uint32_t
DenseBinIterator
<
VAL_T
>::
Get
(
data_size_t
idx
)
{
public:
auto
ret
=
bin_data_
->
data_
[
idx
];
explicit
DenseBinIterator
(
const
DenseBin
<
VAL_T
>*
bin_data
)
if
(
ret
>=
min_bin_
&&
ret
<=
max_bin_
)
{
:
bin_data_
(
bin_data
)
{
return
ret
-
min_bin_
+
bias_
;
}
else
{
return
default_bin_
;
}
}
uint32_t
Get
(
data_size_t
idx
)
override
{
return
bin_data_
->
Get
(
idx
);
}
private:
const
DenseBin
<
VAL_T
>*
bin_data_
;
};
template
<
typename
VAL_T
>
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
data_size_t
)
const
{
return
new
DenseBinIterator
<
VAL_T
>
(
this
);
}
}
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
DenseCategoricalBin
:
public
DenseBin
<
VAL_T
>
{
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
{
public:
return
new
DenseBinIterator
<
VAL_T
>
(
this
,
min_bin
,
max_bin
,
default_bin
);
DenseCategoricalBin
(
data_size_t
num_data
,
int
default_bin
)
}
:
DenseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
data_size_t
idx
=
data_indices
[
i
];
if
(
DenseBin
<
VAL_T
>::
data_
[
idx
]
!=
threshold
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
return
lte_count
;
}
};
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_
#endif // LightGBM_IO_DENSE_BIN_HPP_
src/io/ordered_sparse_bin.hpp
View file @
4f77bd28
...
@@ -41,6 +41,7 @@ public:
...
@@ -41,6 +41,7 @@ public:
++
non_zero_cnt
;
++
non_zero_cnt
;
}
}
ordered_pair_
.
resize
(
non_zero_cnt
);
ordered_pair_
.
resize
(
non_zero_cnt
);
leaf_cnt_
.
push_back
(
non_zero_cnt
);
}
}
~
OrderedSparseBin
()
{
~
OrderedSparseBin
()
{
...
@@ -92,7 +93,7 @@ public:
...
@@ -92,7 +93,7 @@ public:
}
}
}
}
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
left_indices
)
override
{
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
is_in_leaf
,
char
mark
)
override
{
// get current leaf boundary
// get current leaf boundary
const
data_size_t
l_start
=
leaf_start_
[
leaf
];
const
data_size_t
l_start
=
leaf_start_
[
leaf
];
const
data_size_t
l_end
=
l_start
+
leaf_cnt_
[
leaf
];
const
data_size_t
l_end
=
l_start
+
leaf_cnt_
[
leaf
];
...
@@ -100,7 +101,7 @@ public:
...
@@ -100,7 +101,7 @@ public:
data_size_t
new_left_end
=
l_start
;
data_size_t
new_left_end
=
l_start
;
for
(
data_size_t
i
=
l_start
;
i
<
l_end
;
++
i
)
{
for
(
data_size_t
i
=
l_start
;
i
<
l_end
;
++
i
)
{
if
(
left_indices
[
ordered_pair_
[
i
].
ridx
])
{
if
(
is_in_leaf
[
ordered_pair_
[
i
].
ridx
]
==
mark
)
{
std
::
swap
(
ordered_pair_
[
new_left_end
],
ordered_pair_
[
i
]);
std
::
swap
(
ordered_pair_
[
new_left_end
],
ordered_pair_
[
i
]);
++
new_left_end
;
++
new_left_end
;
}
}
...
@@ -110,7 +111,9 @@ public:
...
@@ -110,7 +111,9 @@ public:
leaf_cnt_
[
leaf
]
=
new_left_end
-
l_start
;
leaf_cnt_
[
leaf
]
=
new_left_end
-
l_start
;
leaf_cnt_
[
right_leaf
]
=
l_end
-
new_left_end
;
leaf_cnt_
[
right_leaf
]
=
l_end
-
new_left_end
;
}
}
data_size_t
NonZeroCount
(
int
leaf
)
const
override
{
return
static_cast
<
data_size_t
>
(
leaf_cnt_
[
leaf
]);
}
/*! \brief Disable copy */
/*! \brief Disable copy */
OrderedSparseBin
<
VAL_T
>&
operator
=
(
const
OrderedSparseBin
<
VAL_T
>&
)
=
delete
;
OrderedSparseBin
<
VAL_T
>&
operator
=
(
const
OrderedSparseBin
<
VAL_T
>&
)
=
delete
;
/*! \brief Disable copy */
/*! \brief Disable copy */
...
...
src/io/sparse_bin.hpp
View file @
4f77bd28
...
@@ -23,22 +23,43 @@ const uint8_t kMaxDelta = 255;
...
@@ -23,22 +23,43 @@ const uint8_t kMaxDelta = 255;
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
SparseBinIterator
:
public
BinIterator
{
class
SparseBinIterator
:
public
BinIterator
{
public:
public:
/*!
 * \brief Create an iterator restricted to one feature's bin range, positioned at the start.
 * \param bin_data Sparse bin to read from; not owned
 * \param min_bin Smallest stored value that belongs to the feature
 * \param max_bin Largest stored value that belongs to the feature
 * \param default_bin Default bin of the feature; Get() returns it for values outside the range
 */
SparseBinIterator(const SparseBin<VAL_T>* bin_data,
  uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
  : bin_data_(bin_data),
  min_bin_(static_cast<VAL_T>(min_bin)),
  max_bin_(static_cast<VAL_T>(max_bin)),
  // cast to VAL_T, not uint8_t: default_bin_ is a VAL_T and a narrower cast
  // would truncate default bins above 255 for 16/32-bit value types
  default_bin_(static_cast<VAL_T>(default_bin)) {
  // Get() adds this bias to in-range values; it is 1 only when the default bin is 0
  if (default_bin_ == 0) {
    bias_ = 1;
  } else {
    bias_ = 0;
  }
  Reset(0);
}
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
data_size_t
start_idx
)
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
data_size_t
start_idx
)
:
bin_data_
(
bin_data
)
{
:
bin_data_
(
bin_data
)
{
Reset
(
start_idx
);
Reset
(
start_idx
);
}
}
inline
VAL_T
Inner
Get
(
data_size_t
idx
);
inline
VAL_T
Raw
Get
(
data_size_t
idx
);
inline
uint32_t
Get
(
data_size_t
idx
)
override
{
inline
uint32_t
Get
(
data_size_t
idx
)
override
{
return
InnerGet
(
idx
);
VAL_T
ret
=
RawGet
(
idx
);
if
(
ret
>=
min_bin_
&&
ret
<=
max_bin_
)
{
return
ret
-
min_bin_
+
bias_
;
}
else
{
return
default_bin_
;
}
}
}
inline
void
Reset
(
data_size_t
idx
);
inline
void
Reset
(
data_size_t
idx
)
override
;
private:
private:
const
SparseBin
<
VAL_T
>*
bin_data_
;
const
SparseBin
<
VAL_T
>*
bin_data_
;
data_size_t
cur_pos_
;
data_size_t
cur_pos_
;
data_size_t
i_delta_
;
data_size_t
i_delta_
;
VAL_T
min_bin_
;
VAL_T
max_bin_
;
VAL_T
default_bin_
;
uint8_t
bias_
;
};
};
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
...
@@ -50,17 +71,15 @@ public:
...
@@ -50,17 +71,15 @@ public:
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
SparseBin
(
data_size_t
num_data
,
uint32_t
default_bin
)
SparseBin
(
data_size_t
num_data
)
:
num_data_
(
num_data
)
{
:
num_data_
(
num_data
)
{
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
)
;
int
num_threads
=
1
;
#pragma omp parallel
#pragma omp parallel
#pragma omp master
#pragma omp master
{
{
num_threads_
=
omp_get_num_threads
();
num_threads
=
omp_get_num_threads
();
}
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
push_buffers_
.
emplace_back
();
}
}
push_buffers_
.
resize
(
num_threads
);
}
}
~
SparseBin
()
{
~
SparseBin
()
{
...
@@ -73,12 +92,12 @@ public:
...
@@ -73,12 +92,12 @@ public:
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
override
{
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
override
{
auto
cur_bin
=
static_cast
<
VAL_T
>
(
value
);
auto
cur_bin
=
static_cast
<
VAL_T
>
(
value
);
if
(
cur_bin
!=
default_bin_
)
{
if
(
cur_bin
!=
0
)
{
push_buffers_
[
tid
].
emplace_back
(
idx
,
cur_bin
);
push_buffers_
[
tid
].
emplace_back
(
idx
,
cur_bin
);
}
}
}
}
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
,
data_size_t
,
const
score_t
*
,
void
ConstructHistogram
(
const
data_size_t
*
,
data_size_t
,
const
score_t
*
,
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
...
@@ -88,11 +107,10 @@ public:
...
@@ -88,11 +107,10 @@ public:
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
data_size_t
*
cur_pos
)
const
{
data_size_t
*
cur_pos
)
const
{
const
VAL_T
non_data_flag
=
std
::
numeric_limits
<
VAL_T
>::
max
();
++
(
*
i_delta
);
++
(
*
i_delta
);
*
cur_pos
+=
deltas_
[
*
i_delta
];
*
cur_pos
+=
deltas_
[
*
i_delta
];
data_size_t
factor
=
1
;
data_size_t
factor
=
1
;
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
non_data_flag
)
{
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
0
)
{
++
(
*
i_delta
);
++
(
*
i_delta
);
factor
*=
kMaxDelta
;
factor
*=
kMaxDelta
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
...
@@ -104,17 +122,33 @@ public:
...
@@ -104,17 +122,33 @@ public:
}
}
}
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
// not need to split
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
VAL_T
maxb
=
static_cast
<
VAL_T
>
(
max_bin
);
if
(
default_bin
==
0
)
{
th
-=
1
;
}
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
data_size_t
lte_count
=
0
;
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
InnerGet
(
idx
);
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
threshold
)
{
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
lte_indices
[
lte_count
++
]
=
idx
;
...
@@ -133,16 +167,14 @@ public:
...
@@ -133,16 +167,14 @@ public:
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
pair_cnt
+=
push_buffers_
[
i
].
size
();
pair_cnt
+=
push_buffers_
[
i
].
size
();
}
}
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
idx_val_pairs
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val_pairs
=
push_buffers_
[
0
];
// merge
idx_val_pairs
.
reserve
(
pair_cnt
);
idx_val_pairs
.
reserve
(
pair_cnt
);
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
push_buffers_
.
size
();
++
i
)
{
idx_val_pairs
.
insert
(
idx_val_pairs
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
idx_val_pairs
.
insert
(
idx_val_pairs
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
shrink_to_fit
();
push_buffers_
[
i
].
shrink_to_fit
();
}
}
push_buffers_
.
clear
();
push_buffers_
.
shrink_to_fit
();
// sort by data index
// sort by data index
std
::
sort
(
idx_val_pairs
.
begin
(),
idx_val_pairs
.
end
(),
std
::
sort
(
idx_val_pairs
.
begin
(),
idx_val_pairs
.
end
(),
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
...
@@ -155,7 +187,6 @@ public:
...
@@ -155,7 +187,6 @@ public:
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val_pairs
)
{
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val_pairs
)
{
deltas_
.
clear
();
deltas_
.
clear
();
vals_
.
clear
();
vals_
.
clear
();
const
VAL_T
non_data_flag
=
std
::
numeric_limits
<
VAL_T
>::
max
();
// transform to delta array
// transform to delta array
data_size_t
last_idx
=
0
;
data_size_t
last_idx
=
0
;
for
(
size_t
i
=
0
;
i
<
idx_val_pairs
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
idx_val_pairs
.
size
();
++
i
)
{
...
@@ -164,7 +195,7 @@ public:
...
@@ -164,7 +195,7 @@ public:
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
while
(
cur_delta
>
kMaxDelta
)
{
while
(
cur_delta
>
kMaxDelta
)
{
deltas_
.
push_back
(
cur_delta
%
kMaxDelta
);
deltas_
.
push_back
(
cur_delta
%
kMaxDelta
);
vals_
.
push_back
(
non_data_flag
);
vals_
.
push_back
(
0
);
cur_delta
/=
kMaxDelta
;
cur_delta
/=
kMaxDelta
;
}
}
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
...
@@ -269,8 +300,8 @@ public:
...
@@ -269,8 +300,8 @@ public:
SparseBinIterator
<
VAL_T
>
iterator
(
other_bin
,
used_indices
[
0
]);
SparseBinIterator
<
VAL_T
>
iterator
(
other_bin
,
used_indices
[
0
]);
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
VAL_T
bin
=
iterator
.
Inner
Get
(
used_indices
[
i
]);
VAL_T
bin
=
iterator
.
Raw
Get
(
used_indices
[
i
]);
if
(
bin
!=
default_bin_
)
{
if
(
bin
>
0
)
{
tmp_pair
.
emplace_back
(
i
,
bin
);
tmp_pair
.
emplace_back
(
i
,
bin
);
}
}
}
}
...
@@ -282,22 +313,20 @@ protected:
...
@@ -282,22 +313,20 @@ protected:
std
::
vector
<
uint8_t
>
deltas_
;
std
::
vector
<
uint8_t
>
deltas_
;
std
::
vector
<
VAL_T
>
vals_
;
std
::
vector
<
VAL_T
>
vals_
;
data_size_t
num_vals_
;
data_size_t
num_vals_
;
int
num_threads_
;
std
::
vector
<
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>>
push_buffers_
;
std
::
vector
<
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>>
push_buffers_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
data_size_t
>>
fast_index_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
data_size_t
>>
fast_index_
;
data_size_t
fast_index_shift_
;
data_size_t
fast_index_shift_
;
VAL_T
default_bin_
;
};
};
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
inline
VAL_T
SparseBinIterator
<
VAL_T
>::
Inner
Get
(
data_size_t
idx
)
{
inline
VAL_T
SparseBinIterator
<
VAL_T
>::
Raw
Get
(
data_size_t
idx
)
{
while
(
cur_pos_
<
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
while
(
cur_pos_
<
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
bin_data_
->
NextNonzero
(
&
i_delta_
,
&
cur_pos_
);
bin_data_
->
NextNonzero
(
&
i_delta_
,
&
cur_pos_
);
}
}
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
&&
i_delta_
>=
0
)
{
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
&&
i_delta_
>=
0
)
{
return
bin_data_
->
vals_
[
i_delta_
];
return
bin_data_
->
vals_
[
i_delta_
];
}
else
{
}
else
{
return
bin_data_
->
default_bin_
;
return
0
;
}
}
}
}
...
@@ -309,38 +338,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
...
@@ -309,38 +338,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
}
}
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
data_size_t
start_idx
)
const
{
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
{
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
start_idx
);
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
min_bin
,
max_bin
,
default_bin
);
}
}
template
<
typename
VAL_T
>
class
SparseCategoricalBin
:
public
SparseBin
<
VAL_T
>
{
public:
SparseCategoricalBin
(
data_size_t
num_data
,
uint32_t
default_bin
)
:
SparseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
}
virtual
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
InnerGet
(
idx
);
if
(
bin
!=
threshold
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
return
lte_count
;
}
};
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_
#endif // LightGBM_IO_SPARSE_BIN_HPP_
\ No newline at end of file
src/io/tree.cpp
View file @
4f77bd28
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <sstream>
#include <sstream>
#include <unordered_map>
#include <unordered_map>
...
@@ -16,22 +15,16 @@
...
@@ -16,22 +15,16 @@
namespace
LightGBM
{
namespace
LightGBM
{
std
::
vector
<
bool
(
*
)(
unsigned
int
,
unsigned
int
)
>
Tree
::
inner_decision_funs
=
{
Tree
::
NumericalDecision
<
unsigned
int
>
,
Tree
::
CategoricalDecision
<
unsigned
int
>
};
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
Tree
::
decision_funs
=
{
Tree
::
NumericalDecision
<
double
>
,
Tree
::
CategoricalDecision
<
double
>
};
Tree
::
Tree
(
int
max_leaves
)
Tree
::
Tree
(
int
max_leaves
)
:
max_leaves_
(
max_leaves
)
{
:
max_leaves_
(
max_leaves
)
{
num_leaves_
=
0
;
num_leaves_
=
0
;
left_child_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
left_child_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
right_child_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
right_child_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
split_feature_inner
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
split_feature_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
split_feature_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
split_feature_real_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
threshold_in_bin_
=
std
::
vector
<
uint32_t
>
(
max_leaves_
-
1
);
threshold_in_bin_
=
std
::
vector
<
unsigned
int
>
(
max_leaves_
-
1
);
threshold_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
threshold_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
decision_type_
=
std
::
vector
<
int8_t
>
(
max_leaves_
-
1
);
split_gain_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
split_gain_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
leaf_parent_
=
std
::
vector
<
int
>
(
max_leaves_
);
leaf_parent_
=
std
::
vector
<
int
>
(
max_leaves_
);
leaf_value_
=
std
::
vector
<
double
>
(
max_leaves_
);
leaf_value_
=
std
::
vector
<
double
>
(
max_leaves_
);
...
@@ -48,7 +41,7 @@ Tree::~Tree() {
...
@@ -48,7 +41,7 @@ Tree::~Tree() {
}
}
int
Tree
::
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
unsigned
in
t
threshold_bin
,
int
real_feature
,
int
Tree
::
Split
(
int
leaf
,
int
feature
,
uint32_
t
threshold_bin
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
)
{
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
)
{
int
new_node_idx
=
num_leaves_
-
1
;
int
new_node_idx
=
num_leaves_
-
1
;
...
@@ -63,15 +56,10 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
...
@@ -63,15 +56,10 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
}
}
}
// add new node
// add new node
split_feature_
[
new_node_idx
]
=
feature
;
split_feature_
inner
[
new_node_idx
]
=
feature
;
split_feature_
real_
[
new_node_idx
]
=
real_feature
;
split_feature_
[
new_node_idx
]
=
real_feature
;
threshold_in_bin_
[
new_node_idx
]
=
threshold_bin
;
threshold_in_bin_
[
new_node_idx
]
=
threshold_bin
;
threshold_
[
new_node_idx
]
=
threshold_double
;
threshold_
[
new_node_idx
]
=
threshold_double
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
decision_type_
[
new_node_idx
]
=
0
;
}
else
{
decision_type_
[
new_node_idx
]
=
1
;
}
split_gain_
[
new_node_idx
]
=
gain
;
split_gain_
[
new_node_idx
]
=
gain
;
// add two new leaves
// add two new leaves
left_child_
[
new_node_idx
]
=
~
leaf
;
left_child_
[
new_node_idx
]
=
~
leaf
;
...
@@ -95,42 +83,74 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
...
@@ -95,42 +83,74 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iterators
(
data
->
num_features
());
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
iterators
[
i
].
reset
(
data
->
FeatureAt
(
i
)
->
bin_data
()
->
GetIterator
(
start
));
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
}
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iterators
,
i
)]);
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
}
iter
[
i
]
->
Reset
(
start
);
});
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
i
)]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
i
)]);
}
});
}
}
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
const
data_size_t
*
used_data_indices
,
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
const
data_size_t
*
used_data_indices
,
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
data_size_t
num_data
,
double
*
score
)
const
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iterators
(
data
->
num_features
());
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
iterators
[
i
].
reset
(
data
->
FeatureAt
(
i
)
->
bin_data
()
->
GetIterator
(
used_data_indices
[
start
]));
const
int
fidx
=
split_feature_inner
[
i
];
}
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iterators
,
used_data_indices
[
i
])]);
}
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
});
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
used_data_indices
[
i
])]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
used_data_indices
[
i
])]);
}
});
}
}
}
std
::
string
Tree
::
ToString
()
{
std
::
string
Tree
::
ToString
()
{
std
::
stringstream
str_buf
;
std
::
stringstream
str_buf
;
str_buf
<<
"num_leaves="
<<
num_leaves_
<<
std
::
endl
;
str_buf
<<
"num_leaves="
<<
num_leaves_
<<
std
::
endl
;
str_buf
<<
"split_feature="
str_buf
<<
"split_feature="
<<
Common
::
ArrayToString
<
int
>
(
split_feature_
real_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
int
>
(
split_feature_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"split_gain="
str_buf
<<
"split_gain="
<<
Common
::
ArrayToString
<
double
>
(
split_gain_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
double
>
(
split_gain_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"threshold="
str_buf
<<
"threshold="
<<
Common
::
ArrayToString
<
double
>
(
threshold_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
double
>
(
threshold_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"decision_type="
<<
Common
::
ArrayToString
<
int
>
(
Common
::
ArrayCast
<
int8_t
,
int
>
(
decision_type_
),
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"left_child="
str_buf
<<
"left_child="
<<
Common
::
ArrayToString
<
int
>
(
left_child_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
int
>
(
left_child_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"right_child="
str_buf
<<
"right_child="
...
@@ -166,10 +186,9 @@ std::string Tree::NodeToJSON(int index) {
...
@@ -166,10 +186,9 @@ std::string Tree::NodeToJSON(int index) {
// non-leaf
// non-leaf
str_buf
<<
"{"
<<
std
::
endl
;
str_buf
<<
"{"
<<
std
::
endl
;
str_buf
<<
"
\"
split_index
\"
:"
<<
index
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_index
\"
:"
<<
index
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_feature
\"
:"
<<
split_feature_
real_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_feature
\"
:"
<<
split_feature_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_gain
\"
:"
<<
split_gain_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_gain
\"
:"
<<
split_gain_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
threshold
\"
:"
<<
threshold_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
threshold
\"
:"
<<
threshold_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
decision_type
\"
:
\"
"
<<
Tree
::
GetDecisionTypeName
(
decision_type_
[
index
])
<<
"
\"
,"
<<
std
::
endl
;
str_buf
<<
"
\"
internal_value
\"
:"
<<
internal_value_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_value
\"
:"
<<
internal_value_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_count
\"
:"
<<
internal_count_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_count
\"
:"
<<
internal_count_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
left_child
\"
:"
<<
NodeToJSON
(
left_child_
[
index
])
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
left_child
\"
:"
<<
NodeToJSON
(
left_child_
[
index
])
<<
","
<<
std
::
endl
;
...
@@ -207,7 +226,7 @@ Tree::Tree(const std::string& str) {
...
@@ -207,7 +226,7 @@ Tree::Tree(const std::string& str) {
||
key_vals
.
count
(
"left_child"
)
<=
0
||
key_vals
.
count
(
"right_child"
)
<=
0
||
key_vals
.
count
(
"left_child"
)
<=
0
||
key_vals
.
count
(
"right_child"
)
<=
0
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"decision_type"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
)
{
)
{
Log
::
Fatal
(
"Tree model string format error"
);
Log
::
Fatal
(
"Tree model string format error"
);
}
}
...
@@ -216,12 +235,11 @@ Tree::Tree(const std::string& str) {
...
@@ -216,12 +235,11 @@ Tree::Tree(const std::string& str) {
left_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"left_child"
],
' '
,
num_leaves_
-
1
);
left_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"left_child"
],
' '
,
num_leaves_
-
1
);
right_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"right_child"
],
' '
,
num_leaves_
-
1
);
right_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"right_child"
],
' '
,
num_leaves_
-
1
);
split_feature_
real_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"split_feature"
],
' '
,
num_leaves_
-
1
);
split_feature_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"split_feature"
],
' '
,
num_leaves_
-
1
);
threshold_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"threshold"
],
' '
,
num_leaves_
-
1
);
threshold_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"threshold"
],
' '
,
num_leaves_
-
1
);
split_gain_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"split_gain"
],
' '
,
num_leaves_
-
1
);
split_gain_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"split_gain"
],
' '
,
num_leaves_
-
1
);
internal_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"internal_count"
],
' '
,
num_leaves_
-
1
);
internal_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"internal_count"
],
' '
,
num_leaves_
-
1
);
internal_value_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"internal_value"
],
' '
,
num_leaves_
-
1
);
internal_value_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"internal_value"
],
' '
,
num_leaves_
-
1
);
decision_type_
=
Common
::
StringToArray
<
int8_t
>
(
key_vals
[
"decision_type"
],
' '
,
num_leaves_
-
1
);
leaf_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"leaf_count"
],
' '
,
num_leaves_
);
leaf_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"leaf_count"
],
' '
,
num_leaves_
);
leaf_parent_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"leaf_parent"
],
' '
,
num_leaves_
);
leaf_parent_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"leaf_parent"
],
' '
,
num_leaves_
);
...
...
src/metric/binary_metric.hpp
View file @
4f77bd28
...
@@ -103,7 +103,7 @@ public:
...
@@ -103,7 +103,7 @@ public:
explicit
BinaryLoglossMetric
(
const
MetricConfig
&
config
)
:
BinaryMetric
<
BinaryLoglossMetric
>
(
config
)
{}
explicit
BinaryLoglossMetric
(
const
MetricConfig
&
config
)
:
BinaryMetric
<
BinaryLoglossMetric
>
(
config
)
{}
inline
static
double
LossOnPoint
(
float
label
,
double
prob
)
{
inline
static
double
LossOnPoint
(
float
label
,
double
prob
)
{
if
(
label
=
=
0
)
{
if
(
label
<
=
0
)
{
if
(
1.0
f
-
prob
>
kEpsilon
)
{
if
(
1.0
f
-
prob
>
kEpsilon
)
{
return
-
std
::
log
(
1.0
f
-
prob
);
return
-
std
::
log
(
1.0
f
-
prob
);
}
}
...
@@ -128,9 +128,9 @@ public:
...
@@ -128,9 +128,9 @@ public:
inline
static
double
LossOnPoint
(
float
label
,
double
prob
)
{
inline
static
double
LossOnPoint
(
float
label
,
double
prob
)
{
if
(
prob
<=
0.5
f
)
{
if
(
prob
<=
0.5
f
)
{
return
label
;
return
label
>
0
;
}
else
{
}
else
{
return
1.0
f
-
label
;
return
label
<=
0
;
}
}
}
}
...
@@ -207,8 +207,8 @@ public:
...
@@ -207,8 +207,8 @@ public:
// reset
// reset
cur_neg
=
cur_pos
=
0.0
f
;
cur_neg
=
cur_pos
=
0.0
f
;
}
}
cur_neg
+=
1.0
f
-
cur_label
;
cur_neg
+=
(
cur_label
<=
0
)
;
cur_pos
+=
cur_label
;
cur_pos
+=
(
cur_label
>
0
)
;
}
}
}
else
{
// has weights
}
else
{
// has weights
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
...
@@ -224,8 +224,8 @@ public:
...
@@ -224,8 +224,8 @@ public:
// reset
// reset
cur_neg
=
cur_pos
=
0.0
f
;
cur_neg
=
cur_pos
=
0.0
f
;
}
}
cur_neg
+=
(
1.0
f
-
cur_label
)
*
cur_weight
;
cur_neg
+=
(
cur_label
<=
0
)
*
cur_weight
;
cur_pos
+=
cur_label
*
cur_weight
;
cur_pos
+=
(
cur_label
>
0
)
*
cur_weight
;
}
}
}
}
accum
+=
cur_neg
*
(
cur_pos
*
0.5
f
+
sum_pos
);
accum
+=
cur_neg
*
(
cur_pos
*
0.5
f
+
sum_pos
);
...
...
src/objective/binary_objective.hpp
View file @
4f77bd28
...
@@ -28,8 +28,9 @@ public:
...
@@ -28,8 +28,9 @@ public:
data_size_t
cnt_positive
=
0
;
data_size_t
cnt_positive
=
0
;
data_size_t
cnt_negative
=
0
;
data_size_t
cnt_negative
=
0
;
// count for positive and negative samples
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
label_
[
i
]
==
1
)
{
if
(
label_
[
i
]
>
0
)
{
++
cnt_positive
;
++
cnt_positive
;
}
else
{
}
else
{
++
cnt_negative
;
++
cnt_negative
;
...
@@ -64,8 +65,9 @@ public:
...
@@ -64,8 +65,9 @@ public:
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
// get label and label weights
// get label and label weights
const
int
label
=
label_val_
[
static_cast
<
int
>
(
label_
[
i
])];
const
int
is_pos
=
label_
[
i
]
>
0
;
const
double
label_weight
=
label_weights_
[
static_cast
<
int
>
(
label_
[
i
])];
const
int
label
=
label_val_
[
is_pos
];
const
double
label_weight
=
label_weights_
[
is_pos
];
// calculate gradients and hessians
// calculate gradients and hessians
const
double
response
=
-
label
*
sigmoid_
/
(
1.0
f
+
std
::
exp
(
label
*
sigmoid_
*
score
[
i
]));
const
double
response
=
-
label
*
sigmoid_
/
(
1.0
f
+
std
::
exp
(
label
*
sigmoid_
*
score
[
i
]));
const
double
abs_response
=
fabs
(
response
);
const
double
abs_response
=
fabs
(
response
);
...
@@ -76,8 +78,9 @@ public:
...
@@ -76,8 +78,9 @@ public:
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
// get label and label weights
// get label and label weights
const
int
label
=
label_val_
[
static_cast
<
int
>
(
label_
[
i
])];
const
int
is_pos
=
label_
[
i
]
>
0
;
const
double
label_weight
=
label_weights_
[
static_cast
<
int
>
(
label_
[
i
])];
const
int
label
=
label_val_
[
is_pos
];
const
double
label_weight
=
label_weights_
[
is_pos
];
// calculate gradients and hessians
// calculate gradients and hessians
const
double
response
=
-
label
*
sigmoid_
/
(
1.0
f
+
std
::
exp
(
label
*
sigmoid_
*
score
[
i
]));
const
double
response
=
-
label
*
sigmoid_
/
(
1.0
f
+
std
::
exp
(
label
*
sigmoid_
*
score
[
i
]));
const
double
abs_response
=
fabs
(
response
);
const
double
abs_response
=
fabs
(
response
);
...
...
src/objective/rank_objective.hpp
View file @
4f77bd28
...
@@ -52,6 +52,7 @@ public:
...
@@ -52,6 +52,7 @@ public:
num_queries_
=
metadata
.
num_queries
();
num_queries_
=
metadata
.
num_queries
();
// cache inverse max DCG, avoid computation many times
// cache inverse max DCG, avoid computation many times
inverse_max_dcgs_
.
resize
(
num_queries_
);
inverse_max_dcgs_
.
resize
(
num_queries_
);
#pragma omp parallel for schedule(guided)
for
(
data_size_t
i
=
0
;
i
<
num_queries_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_queries_
;
++
i
)
{
inverse_max_dcgs_
[
i
]
=
DCGCalculator
::
CalMaxDCGAtK
(
optimize_pos_at_
,
inverse_max_dcgs_
[
i
]
=
DCGCalculator
::
CalMaxDCGAtK
(
optimize_pos_at_
,
label_
+
query_boundaries_
[
i
],
label_
+
query_boundaries_
[
i
],
...
...
src/objective/regression_objective.hpp
View file @
4f77bd28
...
@@ -259,14 +259,14 @@ public:
...
@@ -259,14 +259,14 @@ public:
if
(
weights_
==
nullptr
)
{
if
(
weights_
==
nullptr
)
{
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
gradients
[
i
]
=
score
[
i
]
-
label_
[
i
];
gradients
[
i
]
=
static_cast
<
score_t
>
(
score
[
i
]
-
label_
[
i
]
)
;
hessians
[
i
]
=
score
[
i
]
+
max_delta_step_
;
hessians
[
i
]
=
static_cast
<
score_t
>
(
score
[
i
]
+
max_delta_step_
)
;
}
}
}
else
{
}
else
{
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
gradients
[
i
]
=
(
score
[
i
]
-
label_
[
i
])
*
weights_
[
i
];
gradients
[
i
]
=
static_cast
<
score_t
>
(
(
score
[
i
]
-
label_
[
i
])
*
weights_
[
i
]
)
;
hessians
[
i
]
=
(
score
[
i
]
+
max_delta_step_
)
*
weights_
[
i
];
hessians
[
i
]
=
static_cast
<
score_t
>
(
(
score
[
i
]
+
max_delta_step_
)
*
weights_
[
i
]
)
;
}
}
}
}
}
}
...
...
src/treelearner/data_parallel_tree_learner.cpp
View file @
4f77bd28
...
@@ -24,7 +24,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
...
@@ -24,7 +24,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
// allocate buffer for communication
// allocate buffer for communication
size_t
buffer_size
=
0
;
size_t
buffer_size
=
0
;
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
buffer_size
+=
train_data_
->
Feature
At
(
i
)
->
num_b
in
()
*
sizeof
(
HistogramBinEntry
);
buffer_size
+=
train_data_
->
Feature
NumB
in
(
i
)
*
sizeof
(
HistogramBinEntry
);
}
}
input_buffer_
.
resize
(
buffer_size
);
input_buffer_
.
resize
(
buffer_size
);
...
@@ -54,7 +54,7 @@ void DataParallelTreeLearner::BeforeTrain() {
...
@@ -54,7 +54,7 @@ void DataParallelTreeLearner::BeforeTrain() {
if
(
is_feature_used_
[
i
])
{
if
(
is_feature_used_
[
i
])
{
int
cur_min_machine
=
static_cast
<
int
>
(
ArrayArgs
<
int
>::
ArgMin
(
num_bins_distributed
));
int
cur_min_machine
=
static_cast
<
int
>
(
ArrayArgs
<
int
>::
ArgMin
(
num_bins_distributed
));
feature_distribution
[
cur_min_machine
].
push_back
(
i
);
feature_distribution
[
cur_min_machine
].
push_back
(
i
);
num_bins_distributed
[
cur_min_machine
]
+=
train_data_
->
Feature
At
(
i
)
->
num_b
in
();
num_bins_distributed
[
cur_min_machine
]
+=
train_data_
->
Feature
NumB
in
(
i
);
}
}
is_feature_aggregated_
[
i
]
=
false
;
is_feature_aggregated_
[
i
]
=
false
;
}
}
...
@@ -68,7 +68,7 @@ void DataParallelTreeLearner::BeforeTrain() {
...
@@ -68,7 +68,7 @@ void DataParallelTreeLearner::BeforeTrain() {
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
block_len_
[
i
]
=
0
;
block_len_
[
i
]
=
0
;
for
(
auto
fid
:
feature_distribution
[
i
])
{
for
(
auto
fid
:
feature_distribution
[
i
])
{
block_len_
[
i
]
+=
train_data_
->
Feature
At
(
fid
)
->
num_bin
(
)
*
sizeof
(
HistogramBinEntry
);
block_len_
[
i
]
+=
train_data_
->
Feature
NumBin
(
fid
)
*
sizeof
(
HistogramBinEntry
);
}
}
reduce_scatter_size_
+=
block_len_
[
i
];
reduce_scatter_size_
+=
block_len_
[
i
];
}
}
...
@@ -83,7 +83,7 @@ void DataParallelTreeLearner::BeforeTrain() {
...
@@ -83,7 +83,7 @@ void DataParallelTreeLearner::BeforeTrain() {
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
for
(
auto
fid
:
feature_distribution
[
i
])
{
for
(
auto
fid
:
feature_distribution
[
i
])
{
buffer_write_start_pos_
[
fid
]
=
bin_size
;
buffer_write_start_pos_
[
fid
]
=
bin_size
;
bin_size
+=
train_data_
->
Feature
At
(
fid
)
->
num_bin
(
)
*
sizeof
(
HistogramBinEntry
);
bin_size
+=
train_data_
->
Feature
NumBin
(
fid
)
*
sizeof
(
HistogramBinEntry
);
}
}
}
}
...
@@ -91,7 +91,7 @@ void DataParallelTreeLearner::BeforeTrain() {
...
@@ -91,7 +91,7 @@ void DataParallelTreeLearner::BeforeTrain() {
bin_size
=
0
;
bin_size
=
0
;
for
(
auto
fid
:
feature_distribution
[
rank_
])
{
for
(
auto
fid
:
feature_distribution
[
rank_
])
{
buffer_read_start_pos_
[
fid
]
=
bin_size
;
buffer_read_start_pos_
[
fid
]
=
bin_size
;
bin_size
+=
train_data_
->
Feature
At
(
fid
)
->
num_bin
(
)
*
sizeof
(
HistogramBinEntry
);
bin_size
+=
train_data_
->
Feature
NumBin
(
fid
)
*
sizeof
(
HistogramBinEntry
);
}
}
// sync global data sumup info
// sync global data sumup info
...
@@ -125,49 +125,51 @@ void DataParallelTreeLearner::BeforeTrain() {
...
@@ -125,49 +125,51 @@ void DataParallelTreeLearner::BeforeTrain() {
}
}
void
DataParallelTreeLearner
::
FindBestThresholds
()
{
void
DataParallelTreeLearner
::
FindBestThresholds
()
{
train_data_
->
ConstructHistograms
(
is_feature_used_
,
smaller_leaf_splits_
->
data_indices
(),
smaller_leaf_splits_
->
num_data_in_leaf
(),
smaller_leaf_splits_
->
LeafIndex
(),
ordered_bins_
,
gradients_
,
hessians_
,
ordered_gradients_
.
data
(),
ordered_hessians_
.
data
(),
smaller_leaf_histogram_array_
[
0
].
RawData
()
-
1
);
// construct local histograms
// construct local histograms
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
feature_index
=
0
;
feature_index
<
num_features_
;
++
feature_index
)
{
for
(
int
feature_index
=
0
;
feature_index
<
num_features_
;
++
feature_index
)
{
if
((
!
is_feature_used_
.
empty
()
&&
is_feature_used_
[
feature_index
]
==
false
))
continue
;
if
((
!
is_feature_used_
.
empty
()
&&
is_feature_used_
[
feature_index
]
==
false
))
continue
;
// construct histograms for smaller leaf
if
(
ordered_bins_
[
feature_index
]
==
nullptr
)
{
// if not use ordered bin
train_data_
->
FeatureAt
(
feature_index
)
->
bin_data
()
->
ConstructHistogram
(
smaller_leaf_splits_
->
data_indices
(),
smaller_leaf_splits_
->
num_data_in_leaf
(),
ptr_to_ordered_gradients_smaller_leaf_
,
ptr_to_ordered_hessians_smaller_leaf_
,
smaller_leaf_histogram_array_
[
feature_index
].
GetData
());
}
else
{
// used ordered bin
ordered_bins_
[
feature_index
]
->
ConstructHistogram
(
smaller_leaf_splits_
->
LeafIndex
(),
gradients_
,
hessians_
,
smaller_leaf_histogram_array_
[
feature_index
].
GetData
());
}
// copy to buffer
// copy to buffer
std
::
memcpy
(
input_buffer_
.
data
()
+
buffer_write_start_pos_
[
feature_index
],
std
::
memcpy
(
input_buffer_
.
data
()
+
buffer_write_start_pos_
[
feature_index
],
smaller_leaf_histogram_array_
[
feature_index
].
Histogram
Data
(),
smaller_leaf_histogram_array_
[
feature_index
].
Raw
Data
(),
smaller_leaf_histogram_array_
[
feature_index
].
SizeOfHistgram
());
smaller_leaf_histogram_array_
[
feature_index
].
SizeOfHistgram
());
}
}
// Reduce scatter for histogram
// Reduce scatter for histogram
Network
::
ReduceScatter
(
input_buffer_
.
data
(),
reduce_scatter_size_
,
block_start_
.
data
(),
Network
::
ReduceScatter
(
input_buffer_
.
data
(),
reduce_scatter_size_
,
block_start_
.
data
(),
block_len_
.
data
(),
output_buffer_
.
data
(),
&
HistogramBinEntry
::
SumReducer
);
block_len_
.
data
(),
output_buffer_
.
data
(),
&
HistogramBinEntry
::
SumReducer
);
std
::
vector
<
SplitInfo
>
smaller_best
(
num_threads_
,
SplitInfo
());
std
::
vector
<
SplitInfo
>
larger_best
(
num_threads_
,
SplitInfo
());
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
feature_index
=
0
;
feature_index
<
num_features_
;
++
feature_index
)
{
for
(
int
feature_index
=
0
;
feature_index
<
num_features_
;
++
feature_index
)
{
if
(
!
is_feature_aggregated_
[
feature_index
])
continue
;
if
(
!
is_feature_aggregated_
[
feature_index
])
continue
;
const
int
tid
=
omp_get_thread_num
();
// restore global histograms from buffer
// restore global histograms from buffer
smaller_leaf_histogram_array_
[
feature_index
].
FromMemory
(
smaller_leaf_histogram_array_
[
feature_index
].
FromMemory
(
output_buffer_
.
data
()
+
buffer_read_start_pos_
[
feature_index
]);
output_buffer_
.
data
()
+
buffer_read_start_pos_
[
feature_index
]);
train_data_
->
FixHistogram
(
feature_index
,
smaller_leaf_splits_
->
sum_gradients
(),
smaller_leaf_splits_
->
sum_hessians
(),
smaller_leaf_splits_
->
num_data_in_leaf
(),
smaller_leaf_histogram_array_
[
feature_index
].
RawData
());
SplitInfo
smaller_split
;
// find best threshold for smaller child
// find best threshold for smaller child
smaller_leaf_histogram_array_
[
feature_index
].
FindBestThreshold
(
smaller_leaf_histogram_array_
[
feature_index
].
FindBestThreshold
(
smaller_leaf_splits_
->
sum_gradients
(),
smaller_leaf_splits_
->
sum_gradients
(),
smaller_leaf_splits_
->
sum_hessians
(),
smaller_leaf_splits_
->
sum_hessians
(),
GetGlobalDataCountInLeaf
(
smaller_leaf_splits_
->
LeafIndex
()),
GetGlobalDataCountInLeaf
(
smaller_leaf_splits_
->
LeafIndex
()),
&
smaller_leaf_splits_
->
BestSplitPerFeature
()[
feature_index
]);
&
smaller_split
);
if
(
smaller_split
.
gain
>
smaller_best
[
tid
].
gain
)
{
smaller_best
[
tid
]
=
smaller_split
;
}
// only root leaf
// only root leaf
if
(
larger_leaf_splits_
==
nullptr
||
larger_leaf_splits_
->
LeafIndex
()
<
0
)
continue
;
if
(
larger_leaf_splits_
==
nullptr
||
larger_leaf_splits_
->
LeafIndex
()
<
0
)
continue
;
...
@@ -175,35 +177,36 @@ void DataParallelTreeLearner::FindBestThresholds() {
...
@@ -175,35 +177,36 @@ void DataParallelTreeLearner::FindBestThresholds() {
// construct histograms for the larger leaf; the larger leaf is initialized as the parent, so we can just subtract the smaller leaf's histograms
// construct histograms for the larger leaf; the larger leaf is initialized as the parent, so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_
[
feature_index
].
Subtract
(
larger_leaf_histogram_array_
[
feature_index
].
Subtract
(
smaller_leaf_histogram_array_
[
feature_index
]);
smaller_leaf_histogram_array_
[
feature_index
]);
SplitInfo
larger_split
;
// find best threshold for larger child
// find best threshold for larger child
larger_leaf_histogram_array_
[
feature_index
].
FindBestThreshold
(
larger_leaf_histogram_array_
[
feature_index
].
FindBestThreshold
(
larger_leaf_splits_
->
sum_gradients
(),
larger_leaf_splits_
->
sum_gradients
(),
larger_leaf_splits_
->
sum_hessians
(),
larger_leaf_splits_
->
sum_hessians
(),
GetGlobalDataCountInLeaf
(
larger_leaf_splits_
->
LeafIndex
()),
GetGlobalDataCountInLeaf
(
larger_leaf_splits_
->
LeafIndex
()),
&
larger_leaf_splits_
->
BestSplitPerFeature
()[
feature_index
]);
&
larger_split
);
if
(
larger_split
.
gain
>
larger_best
[
tid
].
gain
)
{
larger_best
[
tid
]
=
larger_split
;
}
}
}
auto
smaller_best_idx
=
ArrayArgs
<
SplitInfo
>::
ArgMax
(
smaller_best
);
int
leaf
=
smaller_leaf_splits_
->
LeafIndex
();
best_split_per_leaf_
[
leaf
]
=
smaller_best
[
smaller_best_idx
];
if
(
larger_leaf_splits_
==
nullptr
||
larger_leaf_splits_
->
LeafIndex
()
<
0
)
{
return
;
}
leaf
=
larger_leaf_splits_
->
LeafIndex
();
auto
larger_best_idx
=
ArrayArgs
<
SplitInfo
>::
ArgMax
(
larger_best
);
best_split_per_leaf_
[
leaf
]
=
larger_best
[
larger_best_idx
];
}
}
void
DataParallelTreeLearner
::
FindBestSplitsForLeaves
()
{
void
DataParallelTreeLearner
::
FindBestSplitsForLeaves
()
{
int
smaller_best_feature
=
-
1
,
larger_best_feature
=
-
1
;
SplitInfo
smaller_best
,
larger_best
;
SplitInfo
smaller_best
,
larger_best
;
std
::
vector
<
double
>
gains
;
smaller_best
=
best_split_per_leaf_
[
smaller_leaf_splits_
->
LeafIndex
()];
// find local best split for smaller leaf
for
(
size_t
i
=
0
;
i
<
smaller_leaf_splits_
->
BestSplitPerFeature
().
size
();
++
i
)
{
gains
.
push_back
(
smaller_leaf_splits_
->
BestSplitPerFeature
()[
i
].
gain
);
}
smaller_best_feature
=
static_cast
<
int
>
(
ArrayArgs
<
double
>::
ArgMax
(
gains
));
smaller_best
=
smaller_leaf_splits_
->
BestSplitPerFeature
()[
smaller_best_feature
];
// find local best split for larger leaf
// find local best split for larger leaf
if
(
larger_leaf_splits_
->
LeafIndex
()
>=
0
)
{
if
(
larger_leaf_splits_
->
LeafIndex
()
>=
0
)
{
gains
.
clear
();
larger_best
=
best_split_per_leaf_
[
larger_leaf_splits_
->
LeafIndex
()];
for
(
size_t
i
=
0
;
i
<
larger_leaf_splits_
->
BestSplitPerFeature
().
size
();
++
i
)
{
gains
.
push_back
(
larger_leaf_splits_
->
BestSplitPerFeature
()[
i
].
gain
);
}
larger_best_feature
=
static_cast
<
int
>
(
ArrayArgs
<
double
>::
ArgMax
(
gains
));
larger_best
=
larger_leaf_splits_
->
BestSplitPerFeature
()[
larger_best_feature
];
}
}
// sync global best info
// sync global best info
...
...
src/treelearner/data_partition.hpp
View file @
4f77bd28
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/
feature
.h>
#include <LightGBM/
dataset
.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/openmp_wrapper.h>
...
@@ -93,7 +93,7 @@ public:
...
@@ -93,7 +93,7 @@ public:
* \param threshold threshold to split on
* \param threshold threshold to split on
* \param right_leaf index of right leaf
* \param right_leaf index of right leaf
*/
*/
void
Split
(
int
leaf
,
const
B
in
*
feature
_bins
,
unsigned
in
t
threshold
,
int
right_leaf
)
{
void
Split
(
int
leaf
,
const
Dataset
*
dataset
,
in
t
feature
,
uint32_
t
threshold
,
int
right_leaf
)
{
const
data_size_t
min_inner_size
=
1000
;
const
data_size_t
min_inner_size
=
1000
;
// get leaf boundary
// get leaf boundary
const
data_size_t
begin
=
leaf_begin_
[
leaf
];
const
data_size_t
begin
=
leaf_begin_
[
leaf
];
...
@@ -111,7 +111,7 @@ public:
...
@@ -111,7 +111,7 @@ public:
data_size_t
cur_cnt
=
inner_size
;
data_size_t
cur_cnt
=
inner_size
;
if
(
cur_start
+
cur_cnt
>
cnt
)
{
cur_cnt
=
cnt
-
cur_start
;
}
if
(
cur_start
+
cur_cnt
>
cnt
)
{
cur_cnt
=
cnt
-
cur_start
;
}
// split data inner, reduce the times of function called
// split data inner, reduce the times of function called
data_size_t
cur_left_count
=
feature_bins
->
Split
(
threshold
,
indices_
.
data
()
+
begin
+
cur_start
,
cur_cnt
,
data_size_t
cur_left_count
=
dataset
->
Split
(
feature
,
threshold
,
indices_
.
data
()
+
begin
+
cur_start
,
cur_cnt
,
temp_left_indices_
.
data
()
+
cur_start
,
temp_right_indices_
.
data
()
+
cur_start
);
temp_left_indices_
.
data
()
+
cur_start
,
temp_right_indices_
.
data
()
+
cur_start
);
offsets_buf_
[
i
]
=
cur_start
;
offsets_buf_
[
i
]
=
cur_start
;
left_cnts_buf_
[
i
]
=
cur_left_count
;
left_cnts_buf_
[
i
]
=
cur_left_count
;
...
...
src/treelearner/feature_histogram.hpp
View file @
4f77bd28
This diff is collapsed.
Click to expand it.
src/treelearner/feature_parallel_tree_learner.cpp
View file @
4f77bd28
...
@@ -32,7 +32,7 @@ void FeatureParallelTreeLearner::BeforeTrain() {
...
@@ -32,7 +32,7 @@ void FeatureParallelTreeLearner::BeforeTrain() {
if
(
is_feature_used_
[
i
])
{
if
(
is_feature_used_
[
i
])
{
int
cur_min_machine
=
static_cast
<
int
>
(
ArrayArgs
<
int
>::
ArgMin
(
num_bins_distributed
));
int
cur_min_machine
=
static_cast
<
int
>
(
ArrayArgs
<
int
>::
ArgMin
(
num_bins_distributed
));
feature_distribution
[
cur_min_machine
].
push_back
(
i
);
feature_distribution
[
cur_min_machine
].
push_back
(
i
);
num_bins_distributed
[
cur_min_machine
]
+=
train_data_
->
Feature
At
(
i
)
->
num_b
in
();
num_bins_distributed
[
cur_min_machine
]
+=
train_data_
->
Feature
NumB
in
(
i
);
is_feature_used_
[
i
]
=
false
;
is_feature_used_
[
i
]
=
false
;
}
}
}
}
...
@@ -43,23 +43,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
...
@@ -43,23 +43,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
}
}
void
FeatureParallelTreeLearner
::
FindBestSplitsForLeaves
()
{
void
FeatureParallelTreeLearner
::
FindBestSplitsForLeaves
()
{
int
smaller_best_feature
=
-
1
,
larger_best_feature
=
-
1
;
SplitInfo
smaller_best
,
larger_best
;
SplitInfo
smaller_best
,
larger_best
;
// get best split at smaller leaf
// get best split at smaller leaf
std
::
vector
<
double
>
gains
;
smaller_best
=
best_split_per_leaf_
[
smaller_leaf_splits_
->
LeafIndex
()];
for
(
size_t
i
=
0
;
i
<
smaller_leaf_splits_
->
BestSplitPerFeature
().
size
();
++
i
)
{
// find local best split for larger leaf
gains
.
push_back
(
smaller_leaf_splits_
->
BestSplitPerFeature
()[
i
].
gain
);
}
smaller_best_feature
=
static_cast
<
int
>
(
ArrayArgs
<
double
>::
ArgMax
(
gains
));
smaller_best
=
smaller_leaf_splits_
->
BestSplitPerFeature
()[
smaller_best_feature
];
// get best split at larger leaf
if
(
larger_leaf_splits_
->
LeafIndex
()
>=
0
)
{
if
(
larger_leaf_splits_
->
LeafIndex
()
>=
0
)
{
gains
.
clear
();
larger_best
=
best_split_per_leaf_
[
larger_leaf_splits_
->
LeafIndex
()];
for
(
size_t
i
=
0
;
i
<
larger_leaf_splits_
->
BestSplitPerFeature
().
size
();
++
i
)
{
gains
.
push_back
(
larger_leaf_splits_
->
BestSplitPerFeature
()[
i
].
gain
);
}
larger_best_feature
=
static_cast
<
int
>
(
ArrayArgs
<
double
>::
ArgMax
(
gains
));
larger_best
=
larger_leaf_splits_
->
BestSplitPerFeature
()[
larger_best_feature
];
}
}
// sync global best info
// sync global best info
std
::
memcpy
(
input_buffer_
.
data
(),
&
smaller_best
,
sizeof
(
SplitInfo
));
std
::
memcpy
(
input_buffer_
.
data
(),
&
smaller_best
,
sizeof
(
SplitInfo
));
...
...
src/treelearner/leaf_splits.hpp
View file @
4f77bd28
...
@@ -2,8 +2,8 @@
...
@@ -2,8 +2,8 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include "split_info.hpp"
#include "data_partition.hpp"
#include <vector>
#include <vector>
...
@@ -17,10 +17,6 @@ public:
...
@@ -17,10 +17,6 @@ public:
LeafSplits
(
int
num_feature
,
data_size_t
num_data
)
LeafSplits
(
int
num_feature
,
data_size_t
num_data
)
:
num_data_in_leaf_
(
num_data
),
num_data_
(
num_data
),
num_features_
(
num_feature
),
:
num_data_in_leaf_
(
num_data
),
num_data_
(
num_data
),
num_features_
(
num_feature
),
data_indices_
(
nullptr
)
{
data_indices_
(
nullptr
)
{
best_split_per_feature_
.
resize
(
num_features_
);
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
best_split_per_feature_
[
i
].
feature
=
i
;
}
}
}
void
ResetNumData
(
data_size_t
num_data
)
{
void
ResetNumData
(
data_size_t
num_data
)
{
num_data_
=
num_data
;
num_data_
=
num_data
;
...
@@ -42,9 +38,6 @@ public:
...
@@ -42,9 +38,6 @@ public:
data_indices_
=
data_partition
->
GetIndexOnLeaf
(
leaf
,
&
num_data_in_leaf_
);
data_indices_
=
data_partition
->
GetIndexOnLeaf
(
leaf
,
&
num_data_in_leaf_
);
sum_gradients_
=
sum_gradients
;
sum_gradients_
=
sum_gradients
;
sum_hessians_
=
sum_hessians
;
sum_hessians_
=
sum_hessians
;
for
(
SplitInfo
&
split_info
:
best_split_per_feature_
)
{
split_info
.
Reset
();
}
}
}
/*!
/*!
...
@@ -65,9 +58,6 @@ public:
...
@@ -65,9 +58,6 @@ public:
}
}
sum_gradients_
=
tmp_sum_gradients
;
sum_gradients_
=
tmp_sum_gradients
;
sum_hessians_
=
tmp_sum_hessians
;
sum_hessians_
=
tmp_sum_hessians
;
for
(
SplitInfo
&
split_info
:
best_split_per_feature_
)
{
split_info
.
Reset
();
}
}
}
/*!
/*!
...
@@ -90,9 +80,6 @@ public:
...
@@ -90,9 +80,6 @@ public:
}
}
sum_gradients_
=
tmp_sum_gradients
;
sum_gradients_
=
tmp_sum_gradients
;
sum_hessians_
=
tmp_sum_hessians
;
sum_hessians_
=
tmp_sum_hessians
;
for
(
SplitInfo
&
split_info
:
best_split_per_feature_
)
{
split_info
.
Reset
();
}
}
}
...
@@ -105,9 +92,6 @@ public:
...
@@ -105,9 +92,6 @@ public:
leaf_index_
=
0
;
leaf_index_
=
0
;
sum_gradients_
=
sum_gradients
;
sum_gradients_
=
sum_gradients
;
sum_hessians_
=
sum_hessians
;
sum_hessians_
=
sum_hessians
;
for
(
SplitInfo
&
split_info
:
best_split_per_feature_
)
{
split_info
.
Reset
();
}
}
}
/*!
/*!
...
@@ -115,13 +99,10 @@ public:
...
@@ -115,13 +99,10 @@ public:
*/
*/
void
Init
()
{
void
Init
()
{
leaf_index_
=
-
1
;
leaf_index_
=
-
1
;
for
(
SplitInfo
&
split_info
:
best_split_per_feature_
)
{
data_indices_
=
nullptr
;
split_info
.
Reset
();
num_data_in_leaf_
=
0
;
}
}
}
/*! \brief Get best splits on all features */
std
::
vector
<
SplitInfo
>&
BestSplitPerFeature
()
{
return
best_split_per_feature_
;}
/*! \brief Get current leaf index */
/*! \brief Get current leaf index */
int
LeafIndex
()
const
{
return
leaf_index_
;
}
int
LeafIndex
()
const
{
return
leaf_index_
;
}
...
@@ -140,8 +121,6 @@ public:
...
@@ -140,8 +121,6 @@ public:
private:
private:
/*! \brief store best splits of all feature on current leaf */
std
::
vector
<
SplitInfo
>
best_split_per_feature_
;
/*! \brief current leaf index */
/*! \brief current leaf index */
int
leaf_index_
;
int
leaf_index_
;
/*! \brief number of data on current leaf */
/*! \brief number of data on current leaf */
...
...
src/treelearner/parallel_tree_learner.h
View file @
4f77bd28
...
@@ -170,6 +170,10 @@ private:
...
@@ -170,6 +170,10 @@ private:
std
::
unique_ptr
<
FeatureHistogram
[]
>
smaller_leaf_histogram_array_global_
;
std
::
unique_ptr
<
FeatureHistogram
[]
>
smaller_leaf_histogram_array_global_
;
/*! \brief Store global histogram for larger leaf */
/*! \brief Store global histogram for larger leaf */
std
::
unique_ptr
<
FeatureHistogram
[]
>
larger_leaf_histogram_array_global_
;
std
::
unique_ptr
<
FeatureHistogram
[]
>
larger_leaf_histogram_array_global_
;
std
::
vector
<
HistogramBinEntry
>
smaller_leaf_histogram_data_
;
std
::
vector
<
HistogramBinEntry
>
larger_leaf_histogram_data_
;
std
::
vector
<
FeatureMetainfo
>
feature_metas_
;
};
};
}
// namespace LightGBM
}
// namespace LightGBM
...
...
src/treelearner/serial_tree_learner.cpp
View file @
4f77bd28
This diff is collapsed.
Click to expand it.
src/treelearner/serial_tree_learner.h
View file @
4f77bd28
...
@@ -7,10 +7,10 @@
...
@@ -7,10 +7,10 @@
#include <LightGBM/tree_learner.h>
#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/tree.h>
#include <LightGBM/feature.h>
#include "feature_histogram.hpp"
#include "feature_histogram.hpp"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "split_info.hpp"
#include "data_partition.hpp"
#include "leaf_splits.hpp"
#include "leaf_splits.hpp"
#include <cstdio>
#include <cstdio>
...
@@ -77,7 +77,7 @@ protected:
...
@@ -77,7 +77,7 @@ protected:
* \brief Find best features for leaves from smaller_leaf_splits_ and larger_leaf_splits_.
* \brief Find best features for leaves from smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called after FindBestThresholds.
* This function will be called after FindBestThresholds.
*/
*/
inline
virtual
void
FindBestSplitsForLeaves
();
virtual
void
FindBestSplitsForLeaves
();
/*!
/*!
* \brief Partition tree and data according best split.
* \brief Partition tree and data according best split.
...
@@ -95,12 +95,6 @@ protected:
...
@@ -95,12 +95,6 @@ protected:
*/
*/
inline
virtual
data_size_t
GetGlobalDataCountInLeaf
(
int
leaf_idx
)
const
;
inline
virtual
data_size_t
GetGlobalDataCountInLeaf
(
int
leaf_idx
)
const
;
/*!
* \brief Find best features for leaf from leaf_splits
* \param leaf_splits
*/
inline
void
FindBestSplitForLeaf
(
LeafSplits
*
leaf_splits
);
/*! \brief Last trained decision tree */
/*! \brief Last trained decision tree */
const
Tree
*
last_trained_tree_
;
const
Tree
*
last_trained_tree_
;
/*! \brief number of data */
/*! \brief number of data */
...
@@ -118,7 +112,7 @@ protected:
...
@@ -118,7 +112,7 @@ protected:
/*! \brief used for generate used features */
/*! \brief used for generate used features */
Random
random_
;
Random
random_
;
/*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
/*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
std
::
vector
<
bool
>
is_feature_used_
;
std
::
vector
<
int8_t
>
is_feature_used_
;
/*! \brief pointer to histograms array of parent of current leaves */
/*! \brief pointer to histograms array of parent of current leaves */
FeatureHistogram
*
parent_leaf_histogram_array_
;
FeatureHistogram
*
parent_leaf_histogram_array_
;
/*! \brief pointer to histograms array of smaller leaf */
/*! \brief pointer to histograms array of smaller leaf */
...
@@ -139,15 +133,6 @@ protected:
...
@@ -139,15 +133,6 @@ protected:
/*! \brief hessians of current iteration, ordered for cache optimized */
/*! \brief hessians of current iteration, ordered for cache optimized */
std
::
vector
<
score_t
>
ordered_hessians_
;
std
::
vector
<
score_t
>
ordered_hessians_
;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const
score_t
*
ptr_to_ordered_gradients_smaller_leaf_
;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const
score_t
*
ptr_to_ordered_hessians_smaller_leaf_
;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const
score_t
*
ptr_to_ordered_gradients_larger_leaf_
;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const
score_t
*
ptr_to_ordered_hessians_larger_leaf_
;
/*! \brief Store ordered bin */
/*! \brief Store ordered bin */
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>
ordered_bins_
;
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>
ordered_bins_
;
/*! \brief True if has ordered bin */
/*! \brief True if has ordered bin */
...
@@ -158,15 +143,9 @@ protected:
...
@@ -158,15 +143,9 @@ protected:
HistogramPool
histogram_pool_
;
HistogramPool
histogram_pool_
;
/*! \brief config of tree learner*/
/*! \brief config of tree learner*/
const
TreeConfig
*
tree_config_
;
const
TreeConfig
*
tree_config_
;
int
num_threads_
;
};
};
/*!
 * \brief Runs FindBestSplitForLeaf once for the smaller leaf and once for the
 *        larger leaf, passing the raw pointers held by smaller_leaf_splits_
 *        and larger_leaf_splits_.
 */
inline
void
SerialTreeLearner
::
FindBestSplitsForLeaves
()
{
FindBestSplitForLeaf
(
smaller_leaf_splits_
.
get
());
FindBestSplitForLeaf
(
larger_leaf_splits_
.
get
());
}
inline
data_size_t
SerialTreeLearner
::
GetGlobalDataCountInLeaf
(
int
leafIdx
)
const
{
inline
data_size_t
SerialTreeLearner
::
GetGlobalDataCountInLeaf
(
int
leafIdx
)
const
{
if
(
leafIdx
>=
0
)
{
if
(
leafIdx
>=
0
)
{
return
data_partition_
->
leaf_count
(
leafIdx
);
return
data_partition_
->
leaf_count
(
leafIdx
);
...
@@ -175,19 +154,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) cons
...
@@ -175,19 +154,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) cons
}
}
}
}
/*!
 * \brief Pick the feature with the highest gain for one leaf and record it.
 *        Does nothing when leaf_splits is null or its leaf index is negative.
 *        Otherwise the winning per-feature split is copied into
 *        best_split_per_leaf_ at the leaf's index and its feature field is
 *        set to the winning feature index.
 * \param leaf_splits Per-feature split candidates of the leaf (may be null)
 */
inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
  if (leaf_splits == nullptr) {
    return;
  }
  const int leaf = leaf_splits->LeafIndex();
  if (leaf < 0) {
    return;
  }
  std::vector<SplitInfo>& candidates = leaf_splits->BestSplitPerFeature();
  // collect the gain of every candidate so ArgMax can choose among them
  std::vector<double> gains;
  for (const SplitInfo& candidate : candidates) {
    gains.push_back(candidate.gain);
  }
  const int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
  best_split_per_leaf_[leaf] = candidates[best_feature];
  best_split_per_leaf_[leaf].feature = best_feature;
}
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#endif // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
src/treelearner/split_info.hpp
View file @
4f77bd28
...
@@ -53,6 +53,8 @@ public:
...
@@ -53,6 +53,8 @@ public:
inline
bool
operator
>
(
const
SplitInfo
&
si
)
const
;
inline
bool
operator
>
(
const
SplitInfo
&
si
)
const
;
inline
bool
operator
==
(
const
SplitInfo
&
si
)
const
;
inline
static
void
MaxReducer
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
inline
static
void
MaxReducer
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
const
int
type_size
=
sizeof
(
SplitInfo
);
const
int
type_size
=
sizeof
(
SplitInfo
);
int
used_size
=
0
;
int
used_size
=
0
;
...
@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
...
@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
}
}
}
}
/*!
 * \brief Equality comparison between two splits.
 *        Both sides are normalized first: a NaN gain is replaced by kMinScore
 *        and a feature index of -1 is replaced by INT32_MAX. The splits are
 *        equal when both the normalized gains and the normalized feature
 *        indices match.
 * \param si The split to compare with
 * \return true if gain and feature of both splits are equal after normalization
 */
inline bool SplitInfo::operator == (const SplitInfo& si) const {
  double local_gain = this->gain;
  double other_gain = si.gain;
  // replace nan with -inf
  // NOTE: `x == NAN` is always false because NaN never compares equal to
  // anything, itself included. Self-inequality is true only for NaN.
  if (local_gain != local_gain) {
    local_gain = kMinScore;
  }
  // replace nan with -inf
  if (other_gain != other_gain) {
    other_gain = kMinScore;
  }
  int local_feature = this->feature;
  int other_feature = si.feature;
  // replace -1 with max int
  if (local_feature == -1) {
    local_feature = INT32_MAX;
  }
  // replace -1 with max int
  if (other_feature == -1) {
    other_feature = INT32_MAX;
  }
  // equal only when both the gain and the feature index agree
  return local_gain == other_gain && local_feature == other_feature;
}
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
#endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
src/treelearner/voting_parallel_tree_learner.cpp
View file @
4f77bd28
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment