tianlh / LightGBM-DCU · Commits

Commit 1c774687, authored Aug 05, 2016 by Guolin Ke

first commit

Showing 14 changed files, with 2254 additions and 0 deletions (+2254 −0)
src/objective/regression_objective.hpp             +56  −0
src/treelearner/data_parallel_tree_learner.cpp    +246  −0
src/treelearner/data_partition.hpp                +204  −0
src/treelearner/feature_histogram.hpp             +255  −0
src/treelearner/feature_parallel_tree_learner.cpp  +81  −0
src/treelearner/leaf_splits.hpp                   +157  −0
src/treelearner/parallel_tree_learner.h            +93  −0
src/treelearner/serial_tree_learner.cpp           +373  −0
src/treelearner/serial_tree_learner.h             +187  −0
src/treelearner/split_info.hpp                    +107  −0
src/treelearner/tree_learner.cpp                   +19  −0
windows/LightGBM.sln                               +28  −0
windows/LightGBM.vcxproj                          +222  −0
windows/LightGBM.vcxproj.filters                  +226  −0
src/objective/regression_objective.hpp (new file, mode 100644)
#ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_

#include <LightGBM/objective_function.h>

namespace LightGBM {
/*!
 * \brief Objective function for regression
 */
class RegressionL2loss : public ObjectiveFunction {
 public:
  explicit RegressionL2loss(const ObjectiveConfig&) {
  }

  ~RegressionL2loss() {
  }

  void Init(const Metadata& metadata, data_size_t num_data) override {
    num_data_ = num_data;
    label_ = metadata.label();
    weights_ = metadata.weights();
  }

  void GetGradients(const score_t* score, score_t* gradients,
                    score_t* hessians) const override {
    if (weights_ == nullptr) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        gradients[i] = (score[i] - label_[i]);
        hessians[i] = 1.0;
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        gradients[i] = (score[i] - label_[i]) * weights_[i];
        hessians[i] = weights_[i];
      }
    }
  }

  double GetSigmoid() const override {
    // no sigmoid transform, return -1
    return -1.0;
  }

 private:
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Pointer of label */
  const float* label_;
  /*! \brief Pointer of weights */
  const float* weights_;
};

}  // namespace LightGBM
#endif  // LightGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
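The L2 objective above plugs into boosting through GetGradients: for squared-error loss the per-point gradient is simply score - label and the hessian is constant (the factor of 2 from the second derivative of (s - y)^2 is dropped, which only rescales the step size). Below is a minimal standalone sketch of that rule; the typedefs mirror the ones in <LightGBM/meta.h>, and the driver around them is hypothetical, not part of this commit.

// Hypothetical driver sketch: compute per-iteration gradients with the L2 rule.
#include <vector>

typedef float score_t;   // stand-ins for the typedefs in <LightGBM/meta.h>
typedef int data_size_t;

int main() {
  const data_size_t n = 4;
  std::vector<score_t> score = {0.5f, 1.0f, 0.0f, 2.0f};
  std::vector<score_t> label = {1.0f, 1.0f, 0.5f, 1.5f};
  std::vector<score_t> grad(n), hess(n);
  for (data_size_t i = 0; i < n; ++i) {
    grad[i] = score[i] - label[i];  // same rule as RegressionL2loss::GetGradients
    hess[i] = 1.0f;                 // L2 loss has constant curvature
  }
  // grad = {-0.5, 0.0, -0.5, 0.5}; these feed TreeLearner::Train(grad, hess).
  return 0;
}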
src/treelearner/data_parallel_tree_learner.cpp (new file, mode 100644)
#include "parallel_tree_learner.h"
#include <cstring>
#include <tuple>
#include <vector>
namespace
LightGBM
{
DataParallelTreeLearner
::
DataParallelTreeLearner
(
const
TreeConfig
&
tree_config
)
:
SerialTreeLearner
(
tree_config
),
input_buffer_
(
nullptr
),
output_buffer_
(
nullptr
),
is_feature_aggregated_
(
nullptr
),
block_start_
(
nullptr
),
block_len_
(
nullptr
),
buffer_write_start_pos_
(
nullptr
),
buffer_read_start_pos_
(
nullptr
),
global_data_count_in_leaf_
(
nullptr
)
{
}
DataParallelTreeLearner
::~
DataParallelTreeLearner
()
{
if
(
input_buffer_
!=
nullptr
)
{
delete
[]
input_buffer_
;
}
if
(
output_buffer_
!=
nullptr
)
{
delete
[]
output_buffer_
;
}
if
(
is_feature_aggregated_
!=
nullptr
)
{
delete
[]
is_feature_aggregated_
;
}
if
(
block_start_
!=
nullptr
)
{
delete
[]
block_start_
;
}
if
(
block_len_
!=
nullptr
)
{
delete
[]
block_len_
;
}
if
(
buffer_write_start_pos_
!=
nullptr
)
{
delete
[]
buffer_write_start_pos_
;
}
if
(
buffer_read_start_pos_
!=
nullptr
)
{
delete
[]
buffer_read_start_pos_
;
}
if
(
global_data_count_in_leaf_
!=
nullptr
)
{
delete
[]
global_data_count_in_leaf_
;
}
}
void
DataParallelTreeLearner
::
Init
(
const
Dataset
*
train_data
)
{
// initialize SerialTreeLearner
SerialTreeLearner
::
Init
(
train_data
);
// Get local rank and global machine size
rank_
=
Network
::
rank
();
num_machines_
=
Network
::
num_machines
();
// allocate buffer for communication
size_t
buffer_size
=
0
;
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
buffer_size
+=
train_data_
->
FeatureAt
(
i
)
->
num_bin
()
*
sizeof
(
HistogramBinEntry
);
}
input_buffer_
=
new
char
[
buffer_size
];
output_buffer_
=
new
char
[
buffer_size
];
is_feature_aggregated_
=
new
bool
[
num_features_
];
block_start_
=
new
int
[
num_machines_
];
block_len_
=
new
int
[
num_machines_
];
buffer_write_start_pos_
=
new
int
[
num_features_
];
buffer_read_start_pos_
=
new
int
[
num_features_
];
global_data_count_in_leaf_
=
new
data_size_t
[
num_leaves_
];
}
void
DataParallelTreeLearner
::
BeforeTrain
()
{
SerialTreeLearner
::
BeforeTrain
();
// generate feature partition for current tree
std
::
vector
<
std
::
vector
<
int
>>
feature_distribution
(
num_machines_
,
std
::
vector
<
int
>
());
std
::
vector
<
int
>
num_bins_distributed
(
num_machines_
,
0
);
for
(
int
i
=
0
;
i
<
train_data_
->
num_features
();
++
i
)
{
if
(
is_feature_used_
[
i
])
{
int
cur_min_machine
=
static_cast
<
int
>
(
ArrayArgs
<
int
>::
ArgMin
(
num_bins_distributed
));
feature_distribution
[
cur_min_machine
].
push_back
(
i
);
num_bins_distributed
[
cur_min_machine
]
+=
train_data_
->
FeatureAt
(
i
)
->
num_bin
();
}
is_feature_aggregated_
[
i
]
=
false
;
}
// get local used feature
for
(
auto
fid
:
feature_distribution
[
rank_
])
{
is_feature_aggregated_
[
fid
]
=
true
;
}
// get block start and block len for reduce scatter
reduce_scatter_size_
=
0
;
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
block_len_
[
i
]
=
0
;
for
(
auto
fid
:
feature_distribution
[
i
])
{
block_len_
[
i
]
+=
train_data_
->
FeatureAt
(
fid
)
->
num_bin
()
*
sizeof
(
HistogramBinEntry
);
}
reduce_scatter_size_
+=
block_len_
[
i
];
}
block_start_
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
num_machines_
;
++
i
)
{
block_start_
[
i
]
=
block_start_
[
i
-
1
]
+
block_len_
[
i
-
1
];
}
// get buffer_write_start_pos_
int
bin_size
=
0
;
for
(
int
i
=
0
;
i
<
num_machines_
;
++
i
)
{
for
(
auto
fid
:
feature_distribution
[
i
])
{
buffer_write_start_pos_
[
fid
]
=
bin_size
;
bin_size
+=
train_data_
->
FeatureAt
(
fid
)
->
num_bin
()
*
sizeof
(
HistogramBinEntry
);
}
}
// get buffer_read_start_pos_
bin_size
=
0
;
for
(
auto
fid
:
feature_distribution
[
rank_
])
{
buffer_read_start_pos_
[
fid
]
=
bin_size
;
bin_size
+=
train_data_
->
FeatureAt
(
fid
)
->
num_bin
()
*
sizeof
(
HistogramBinEntry
);
}
// sync global data sumup info
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
data
(
smaller_leaf_splits_
->
num_data_in_leaf
(),
smaller_leaf_splits_
->
sum_gradients
(),
smaller_leaf_splits_
->
sum_hessians
());
int
size
=
sizeof
(
data
);
std
::
memcpy
(
input_buffer_
,
&
data
,
size
);
// global sumup reduce
Network
::
Allreduce
(
input_buffer_
,
size
,
size
,
output_buffer_
,
[](
const
char
*
src
,
char
*
dst
,
int
len
)
{
int
used_size
=
0
;
int
type_size
=
sizeof
(
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
);
const
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
*
p1
;
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
*
p2
;
while
(
used_size
<
len
)
{
p1
=
reinterpret_cast
<
const
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
*>
(
src
);
p2
=
reinterpret_cast
<
std
::
tuple
<
data_size_t
,
score_t
,
score_t
>
*>
(
dst
);
std
::
get
<
0
>
(
*
p2
)
=
std
::
get
<
0
>
(
*
p2
)
+
std
::
get
<
0
>
(
*
p1
);
std
::
get
<
1
>
(
*
p2
)
=
std
::
get
<
1
>
(
*
p2
)
+
std
::
get
<
1
>
(
*
p1
);
std
::
get
<
2
>
(
*
p2
)
=
std
::
get
<
2
>
(
*
p2
)
+
std
::
get
<
2
>
(
*
p1
);
src
+=
type_size
;
dst
+=
type_size
;
used_size
+=
type_size
;
}
});
// copy back
std
::
memcpy
(
&
data
,
output_buffer_
,
size
);
// set global sumup info
smaller_leaf_splits_
->
Init
(
std
::
get
<
1
>
(
data
),
std
::
get
<
2
>
(
data
));
// init global data count in leaf
global_data_count_in_leaf_
[
0
]
=
std
::
get
<
0
>
(
data
);
}
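BeforeTrain above spreads the used features across machines greedily: each feature goes to the machine that currently owns the fewest histogram bins, which keeps the reduce-scatter blocks roughly equal in size. The standalone sketch below reproduces just that assignment step, with std::min_element standing in for ArrayArgs<int>::ArgMin and made-up bin counts.

// Minimal sketch of the greedy load balancing used in BeforeTrain.
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  const int num_machines = 2;
  std::vector<int> feature_bins = {255, 16, 64, 255, 32};  // hypothetical num_bin() per feature
  std::vector<std::vector<int>> feature_distribution(num_machines);
  std::vector<int> num_bins_distributed(num_machines, 0);
  for (int fid = 0; fid < static_cast<int>(feature_bins.size()); ++fid) {
    // same role as ArrayArgs<int>::ArgMin: the machine with the fewest bins so far
    int target = static_cast<int>(std::min_element(num_bins_distributed.begin(),
                     num_bins_distributed.end()) - num_bins_distributed.begin());
    feature_distribution[target].push_back(fid);
    num_bins_distributed[target] += feature_bins[fid];
  }
  for (int m = 0; m < num_machines; ++m) {
    std::cout << "machine " << m << " holds " << num_bins_distributed[m] << " bins\n";
  }
  return 0;  // machine 0 gets features {0, 4} (287 bins); machine 1 gets {1, 2, 3} (335 bins)
}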
void DataParallelTreeLearner::FindBestThresholds() {
  // construct local histograms
  #pragma omp parallel for schedule(guided)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
    // construct histograms for the smaller leaf
    if (ordered_bins_[feature_index] == nullptr) {
      smaller_leaf_histogram_array_[feature_index].Construct(
        smaller_leaf_splits_->data_indices(),
        smaller_leaf_splits_->num_data_in_leaf(),
        smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        ptr_to_ordered_gradients_, ptr_to_ordered_hessians_);
    } else {
      smaller_leaf_histogram_array_[feature_index].Construct(
        ordered_bins_[feature_index],
        smaller_leaf_splits_->LeafIndex(),
        smaller_leaf_splits_->num_data_in_leaf(),
        smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        gradients_, hessians_);
    }
    // copy to buffer
    std::memcpy(input_buffer_ + buffer_write_start_pos_[feature_index],
                smaller_leaf_histogram_array_[feature_index].HistogramData(),
                smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
  }
  // Reduce scatter for histograms
  Network::ReduceScatter(input_buffer_, reduce_scatter_size_, block_start_,
                         block_len_, output_buffer_, &HistogramBinEntry::SumReducer);
  #pragma omp parallel for schedule(guided)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!is_feature_aggregated_[feature_index]) continue;
    // copy the global sumup info
    smaller_leaf_histogram_array_[feature_index].SetSumup(
      GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
      smaller_leaf_splits_->sum_gradients(),
      smaller_leaf_splits_->sum_hessians());
    // restore the global histograms from the buffer
    smaller_leaf_histogram_array_[feature_index].FromMemory(
      output_buffer_ + buffer_read_start_pos_[feature_index]);
    // find the best threshold for the smaller child
    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
      &smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
    // only the root leaf exists
    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
    // construct histograms for the larger leaf: we init the larger leaf as the parent,
    // so we can just subtract the smaller leaf's histograms
    larger_leaf_histogram_array_[feature_index].Subtract(
      smaller_leaf_histogram_array_[feature_index]);
    // set the sumup info for the histogram
    larger_leaf_histogram_array_[feature_index].SetSumup(
      GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
      larger_leaf_splits_->sum_gradients(),
      larger_leaf_splits_->sum_hessians());
    // find the best threshold for the larger child
    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
      &larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
  }
}
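Network::ReduceScatter above merges the per-machine histogram buffers with HistogramBinEntry::SumReducer. The sketch below shows the contract such a reducer has to satisfy: walk two raw byte buffers in lock-step and add the bins element-wise. The BinEntry layout here is an assumption for illustration; the real HistogramBinEntry is defined in the LightGBM headers and may differ.

// Sketch of a bin-wise sum reducer with the same contract as the callback
// passed to Network::ReduceScatter.
#include <cstring>

struct BinEntry {      // stand-in for HistogramBinEntry
  double sum_gradients;
  double sum_hessians;
  int cnt;
};

// Merge `len` bytes of BinEntry data from src into dst, bin by bin.
void SumReduce(const char* src, char* dst, int len) {
  int used = 0;
  while (used < len) {
    const BinEntry* a = reinterpret_cast<const BinEntry*>(src + used);
    BinEntry* b = reinterpret_cast<BinEntry*>(dst + used);
    b->sum_gradients += a->sum_gradients;
    b->sum_hessians  += a->sum_hessians;
    b->cnt           += a->cnt;
    used += sizeof(BinEntry);
  }
}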
void DataParallelTreeLearner::FindBestSplitsForLeaves() {
  int smaller_best_feature = -1, larger_best_feature = -1;
  SplitInfo smaller_best, larger_best;
  std::vector<double> gains;
  // find the local best split for the smaller leaf
  for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
    gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
  }
  smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
  smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
  // find the local best split for the larger leaf
  if (larger_leaf_splits_->LeafIndex() >= 0) {
    gains.clear();
    for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
      gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
    }
    larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
    larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
  }
  // sync the global best info
  std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
  std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
  Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
                     output_buffer_, &SplitInfo::MaxReducer);
  std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
  std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
  // set the best split
  best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
  if (larger_leaf_splits_->LeafIndex() >= 0) {
    best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
  }
}

void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf,
                                    int* left_leaf, int* right_leaf) {
  SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
  // need to update the global number of data in each leaf
  global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
  global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
}

}  // namespace LightGBM
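The root statistics are synced in BeforeTrain with Network::Allreduce and the tuple-summing lambda shown there. Below is a self-contained simulation of that reducer folding one machine's (count, sum_gradients, sum_hessians) tuple into another's; the typedefs are stand-ins for those in <LightGBM/meta.h> and the numbers are invented.

// Standalone simulation of the tuple-sum reducer from BeforeTrain.
#include <cstring>
#include <iostream>
#include <tuple>

typedef int data_size_t;
typedef float score_t;
typedef std::tuple<data_size_t, score_t, score_t> SumTuple;

void Reduce(const char* src, char* dst, int len) {
  int used_size = 0;
  const int type_size = sizeof(SumTuple);
  while (used_size < len) {
    const SumTuple* p1 = reinterpret_cast<const SumTuple*>(src);
    SumTuple* p2 = reinterpret_cast<SumTuple*>(dst);
    std::get<0>(*p2) += std::get<0>(*p1);
    std::get<1>(*p2) += std::get<1>(*p1);
    std::get<2>(*p2) += std::get<2>(*p1);
    src += type_size; dst += type_size; used_size += type_size;
  }
}

int main() {
  SumTuple machine0(100, -3.5f, 100.0f);  // local root stats on machine 0
  SumTuple machine1(150,  1.5f, 150.0f);  // local root stats on machine 1
  char buf[sizeof(SumTuple)];
  std::memcpy(buf, &machine1, sizeof(SumTuple));
  Reduce(buf, reinterpret_cast<char*>(&machine0), sizeof(SumTuple));
  std::cout << "global: n=" << std::get<0>(machine0)        // 250
            << " sum_g=" << std::get<1>(machine0)           // -2.0
            << " sum_h=" << std::get<2>(machine0) << "\n";  // 250.0
  return 0;
}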
src/treelearner/data_partition.hpp (new file, mode 100644)
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <omp.h>
#include <cstring>
#include <vector>

namespace LightGBM {
/*!
 * \brief DataPartition is used to store the partition of data on the tree.
 */
class DataPartition {
 public:
  DataPartition(data_size_t num_data, int num_leafs)
    : num_data_(num_data), num_leaves_(num_leafs) {
    leaf_begin_ = new data_size_t[num_leaves_];
    leaf_count_ = new data_size_t[num_leaves_];
    indices_ = new data_size_t[num_data_];
    temp_left_indices_ = new data_size_t[num_data_];
    temp_right_indices_ = new data_size_t[num_data_];
    used_data_indices_ = nullptr;
    #pragma omp parallel
    #pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
    offsets_buf_ = new data_size_t[num_threads_];
    left_cnts_buf_ = new data_size_t[num_threads_];
    right_cnts_buf_ = new data_size_t[num_threads_];
    left_write_pos_buf_ = new data_size_t[num_threads_];
    right_write_pos_buf_ = new data_size_t[num_threads_];
  }

  ~DataPartition() {
    delete[] leaf_begin_;
    delete[] leaf_count_;
    delete[] indices_;
    delete[] temp_left_indices_;
    delete[] temp_right_indices_;
    delete[] offsets_buf_;
    delete[] left_cnts_buf_;
    delete[] right_cnts_buf_;
    delete[] left_write_pos_buf_;
    delete[] right_write_pos_buf_;
  }

  /*!
   * \brief Init, will put all data on the root (leaf_idx = 0)
   */
  void Init() {
    for (int i = 0; i < num_leaves_; ++i) {
      leaf_count_[i] = 0;
    }
    leaf_begin_[0] = 0;
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
      std::memcpy(indices_, used_data_indices_, used_data_count_ * sizeof(data_size_t));
    }
  }

  /*!
   * \brief Get the data indices of one leaf
   * \param leaf index of leaf
   * \param indices output data indices
   * \return number of data on this leaf
   */
  data_size_t GetIndexOnLeaf(int leaf, data_size_t** indices) const {
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
    (*indices) = static_cast<data_size_t*>(indices_ + begin);
    return leaf_count_[leaf];
  }

  /*!
   * \brief Split the data
   * \param leaf index of leaf
   * \param feature_bins feature bin data
   * \param threshold the threshold to split on
   * \param right_leaf index of right leaf
   */
  void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
    const data_size_t min_inner_size = 1000;
    // get the leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];
    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    // split the data with multi-threading
    #pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > cnt) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
      // split an inner chunk of data, reducing the number of function calls
      data_size_t cur_left_count = feature_bins->Split(threshold,
        indices_ + begin + cur_start, cur_cnt,
        temp_left_indices_ + cur_start, temp_right_indices_ + cur_start);
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
    // copy the indices of both leaves back into indices_
    #pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(indices_ + begin + left_write_pos_buf_[i],
          temp_left_indices_ + offsets_buf_[i],
          left_cnts_buf_[i] * sizeof(data_size_t));
      }
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(indices_ + begin + left_cnt + right_write_pos_buf_[i],
          temp_right_indices_ + offsets_buf_[i],
          right_cnts_buf_[i] * sizeof(data_size_t));
      }
    }
    // update the leaf boundaries
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
   * \brief Set the used data indices before training, used for bagging
   * \param used_data_indices indices of used data
   * \param num_used_data number of used data
   */
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
   * \brief Get the number of data on one leaf
   * \param leaf index of leaf
   * \return number of data on this leaf
   */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
   * \brief Get leaf begin
   * \param leaf index of leaf
   * \return begin index of this leaf
   */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

  const data_size_t* indices() const { return indices_; }

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

 private:
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
  data_size_t* leaf_begin_;
  /*! \brief number of data on one leaf */
  data_size_t* leaf_count_;
  /*! \brief Store all data indices, ordered by leaf [data_in_leaf0, .., data_in_leaf1, ..] */
  data_size_t* indices_;
  /*! \brief temp indices buffer for split */
  data_size_t* temp_left_indices_;
  /*! \brief temp indices buffer for split */
  data_size_t* temp_right_indices_;
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading data partition, stores the chunk offset for each thread */
  data_size_t* offsets_buf_;
  /*! \brief Buffer for multi-threading data partition, stores the left count after split for each thread */
  data_size_t* left_cnts_buf_;
  /*! \brief Buffer for multi-threading data partition, stores the right count after split for each thread */
  data_size_t* right_cnts_buf_;
  /*! \brief Buffer for multi-threading data partition, stores the write position of the left leaf for each thread */
  data_size_t* left_write_pos_buf_;
  /*! \brief Buffer for multi-threading data partition, stores the write position of the right leaf for each thread */
  data_size_t* right_write_pos_buf_;
};

}  // namespace LightGBM
#endif  // LightGBM_TREELEARNER_DATA_PARTITION_HPP_
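DataPartition::Split above is lock-free: every thread partitions its own chunk into temp_left_indices_/temp_right_indices_, and exclusive prefix sums over the per-thread counts give each thread a private write region in indices_. The runnable mini-demo below shows only that offset arithmetic, with hypothetical counts.

// Runnable mini-demo of the exclusive prefix sums used by DataPartition::Split.
#include <iostream>
#include <vector>

int main() {
  std::vector<int> left_cnts  = {4, 2, 5};   // per-thread left counts (made up)
  std::vector<int> right_cnts = {1, 3, 0};   // per-thread right counts (made up)
  int n = static_cast<int>(left_cnts.size());
  std::vector<int> left_pos(n, 0), right_pos(n, 0);
  for (int i = 1; i < n; ++i) {
    left_pos[i]  = left_pos[i - 1]  + left_cnts[i - 1];
    right_pos[i] = right_pos[i - 1] + right_cnts[i - 1];
  }
  int left_cnt = left_pos[n - 1] + left_cnts[n - 1];  // 11 indices go left in total
  for (int i = 0; i < n; ++i) {
    // left blocks pack to the front of the leaf slice, right blocks after left_cnt
    std::cout << "thread " << i << ": left block starts at " << left_pos[i]
              << ", right block starts at " << left_cnt + right_pos[i] << "\n";
  }
  return 0;
}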
src/treelearner/feature_histogram.hpp (new file, mode 100644)
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_

#include "split_info.hpp"
#include <LightGBM/feature.h>
#include <cstring>

namespace LightGBM {
/*!
 * \brief FeatureHistogram is used to construct and store a histogram for a feature.
 */
class FeatureHistogram {
 public:
  FeatureHistogram() : data_(nullptr) {
  }

  ~FeatureHistogram() {
    if (data_ != nullptr) { delete[] data_; }
  }

  /*!
   * \brief Init the feature histogram
   * \param feature the feature data for this histogram
   * \param feature_idx index of this feature
   * \param min_num_data_one_leaf minimal number of data in one leaf
   * \param min_sum_hessian_one_leaf minimal sum of hessians in one leaf
   */
  void Init(const Feature* feature, int feature_idx, data_size_t min_num_data_one_leaf,
            score_t min_sum_hessian_one_leaf) {
    feature_idx_ = feature_idx;
    min_num_data_one_leaf_ = min_num_data_one_leaf;
    min_sum_hessian_one_leaf_ = min_sum_hessian_one_leaf;
    bin_data_ = feature->bin_data();
    num_bins_ = feature->num_bin();
    data_ = new HistogramBinEntry[num_bins_];
  }

  /*!
   * \brief Construct a histogram
   * \param data_indices data indices of current leaf
   * \param num_data number of data in current leaf
   * \param sum_gradients sum of gradients of current leaf
   * \param sum_hessians sum of hessians of current leaf
   * \param ordered_gradients ordered gradients
   * \param ordered_hessians ordered hessians
   */
  void Construct(data_size_t* data_indices, data_size_t num_data, score_t sum_gradients,
                 score_t sum_hessians, const score_t* ordered_gradients,
                 const score_t* ordered_hessians) {
    std::memset(data_, 0, sizeof(HistogramBinEntry) * num_bins_);
    num_data_ = num_data;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians + 2 * kEpsilon;
    bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients,
                                  ordered_hessians, data_);
  }

  /*!
   * \brief Construct a histogram from an ordered bin
   * \param ordered_bin the ordered bin to read from
   * \param leaf current leaf
   * \param num_data number of data in current leaf
   * \param sum_gradients sum of gradients of current leaf
   * \param sum_hessians sum of hessians of current leaf
   * \param gradients
   * \param hessians
   */
  void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data,
                 score_t sum_gradients, score_t sum_hessians,
                 const score_t* gradients, const score_t* hessians) {
    std::memset(data_, 0, sizeof(HistogramBinEntry) * num_bins_);
    num_data_ = num_data;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians + 2 * kEpsilon;
    ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_);
  }

  /*!
   * \brief Set sumup information for the current histogram
   * \param num_data number of data in current leaf
   * \param sum_gradients sum of gradients of current leaf
   * \param sum_hessians sum of hessians of current leaf
   */
  void SetSumup(data_size_t num_data, score_t sum_gradients, score_t sum_hessians) {
    num_data_ = num_data;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians + 2 * kEpsilon;
  }

  /*!
   * \brief Subtract another histogram from the current one
   * \param other The histogram to subtract
   */
  void Subtract(const FeatureHistogram& other) {
    num_data_ -= other.num_data_;
    sum_gradients_ -= other.sum_gradients_;
    sum_hessians_ -= other.sum_hessians_;
    for (unsigned int i = 0; i < num_bins_; ++i) {
      data_[i].cnt -= other.data_[i].cnt;
      data_[i].sum_gradients -= other.data_[i].sum_gradients;
      data_[i].sum_hessians -= other.data_[i].sum_hessians;
    }
  }

  /*!
   * \brief Find the best threshold for this histogram
   * \param output The best split result
   */
  void FindBestThreshold(SplitInfo* output) {
    score_t best_sum_left_gradient = NAN;
    score_t best_sum_left_hessian = NAN;
    score_t best_gain = kMinScore;
    data_size_t best_left_count = 0;
    unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
    score_t sum_right_gradient = 0.0f;
    score_t sum_right_hessian = kEpsilon;
    data_size_t right_count = 0;
    score_t gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
    is_splittable_ = false;
    // from right to left; the data in bin 0 is not needed
    for (unsigned int t = num_bins_ - 1; t > 0; --t) {
      sum_right_gradient += data_[t].sum_gradients;
      sum_right_hessian += data_[t].sum_hessians;
      right_count += data_[t].cnt;
      // if there is not enough data, or the sum of hessians is too small
      if (right_count < min_num_data_one_leaf_
          || sum_right_hessian < min_sum_hessian_one_leaf_) continue;
      data_size_t left_count = num_data_ - right_count;
      // if there is not enough data
      if (left_count < min_num_data_one_leaf_) break;
      score_t sum_left_hessian = sum_hessians_ - sum_right_hessian;
      // if the sum of hessians is too small
      if (sum_left_hessian < min_sum_hessian_one_leaf_) { break; }
      score_t sum_left_gradient = sum_gradients_ - sum_right_gradient;
      // current split gain
      score_t current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
      // the gain is worse than not splitting at all
      if (current_gain < gain_shift) { continue; }
      // mark as splittable
      is_splittable_ = true;
      // better split point
      if (current_gain > best_gain) {
        best_left_count = left_count;
        best_sum_left_gradient = sum_left_gradient;
        best_sum_left_hessian = sum_left_hessian;
        // left is <= threshold, right is > threshold, so this is t-1
        best_threshold = t - 1;
        best_gain = current_gain;
      }
    }
    // update the split information
    output->feature = feature_idx_;
    output->threshold = best_threshold;
    output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient,
                                                      best_sum_left_hessian);
    output->left_count = best_left_count;
    output->left_sum_gradient = best_sum_left_gradient;
    output->left_sum_hessian = best_sum_left_hessian;
    output->right_output = CalculateSplittedLeafOutput(
        sum_gradients_ - best_sum_left_gradient,
        sum_hessians_ - best_sum_left_hessian);
    output->right_count = num_data_ - best_left_count;
    output->right_sum_gradient = sum_gradients_ - best_sum_left_gradient;
    output->right_sum_hessian = sum_hessians_ - best_sum_left_hessian;
    output->gain = best_gain - gain_shift;
  }

  /*!
   * \brief Binary size of this histogram
   */
  int SizeOfHistgram() const {
    return num_bins_ * sizeof(HistogramBinEntry);
  }

  /*!
   * \brief Memory pointer to the histogram data
   */
  const HistogramBinEntry* HistogramData() const {
    return data_;
  }

  /*!
   * \brief Restore the histogram from memory
   */
  void FromMemory(char* memory_data) {
    std::memcpy(data_, memory_data, num_bins_ * sizeof(HistogramBinEntry));
  }

  /*!
   * \brief Set the minimal number of data in one leaf
   */
  void SetMinNumDataOneLeaf(data_size_t new_val) {
    min_num_data_one_leaf_ = new_val;
  }

  /*!
   * \brief Set the minimal sum of hessians in one leaf
   */
  void SetMinSumHessianOneLeaf(score_t new_val) {
    min_sum_hessian_one_leaf_ = new_val;
  }

  /*!
   * \brief True if this histogram can be split
   */
  bool is_splittable() { return is_splittable_; }

  /*!
   * \brief Set whether this histogram is splittable
   */
  void set_is_splittable(bool val) { is_splittable_ = val; }

 private:
  /*!
   * \brief Calculate the split gain based on sum_gradients and sum_hessians
   * \param sum_gradients
   * \param sum_hessians
   * \return split gain
   */
  score_t GetLeafSplitGain(score_t sum_gradients, score_t sum_hessians) const {
    return (sum_gradients * sum_gradients) / (sum_hessians);
  }

  /*!
   * \brief Calculate the output of a leaf based on sum_gradients and sum_hessians
   * \param sum_gradients
   * \param sum_hessians
   * \return leaf output
   */
  score_t CalculateSplittedLeafOutput(score_t sum_gradients, score_t sum_hessians) const {
    return -(sum_gradients) / (sum_hessians);
  }

  int feature_idx_;
  /*! \brief minimal number of data in one leaf */
  data_size_t min_num_data_one_leaf_;
  /*! \brief minimal sum of hessians in one leaf */
  score_t min_sum_hessian_one_leaf_;
  /*! \brief the bin data of the current feature */
  const Bin* bin_data_;
  /*! \brief number of bins in the histogram */
  unsigned int num_bins_;
  /*! \brief per-bin sums of gradient statistics */
  HistogramBinEntry* data_;
  /*! \brief number of all data */
  data_size_t num_data_;
  /*! \brief sum of gradients of the current leaf */
  score_t sum_gradients_;
  /*! \brief sum of hessians of the current leaf */
  score_t sum_hessians_;
  /*! \brief False if this histogram cannot split */
  bool is_splittable_ = true;
};

}  // namespace LightGBM
#endif  // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
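To see the numbers behind Subtract, GetLeafSplitGain, and CalculateSplittedLeafOutput together: the larger child's statistics are recovered by subtracting the smaller child's histogram from the parent's, each leaf's value is -G/H, and a split scores GL*GL/HL + GR*GR/HR - G*G/H. The self-contained check below reproduces those formulas with invented sums; the real code also folds 2*kEpsilon into sum_hessians_ to avoid division by zero, which this sketch omits.

// Numeric check of the FeatureHistogram arithmetic with invented sums.
#include <iostream>

double LeafSplitGain(double g, double h) { return (g * g) / h; }
double LeafOutput(double g, double h) { return -g / h; }

int main() {
  // parent leaf sums, and sums of the smaller child (already constructed)
  double g_parent = -1.0, h_parent = 9.0;
  double g_small = -4.0, h_small = 5.0;
  // larger child by subtraction: no second pass over the data needed
  double g_large = g_parent - g_small;  //  3.0
  double h_large = h_parent - h_small;  //  4.0
  double gain_shift = LeafSplitGain(g_parent, h_parent);       // ~0.111, gain of not splitting
  double gain = LeafSplitGain(g_small, h_small)
              + LeafSplitGain(g_large, h_large) - gain_shift;  // 3.2 + 2.25 - 0.111 ~= 5.339
  std::cout << "split gain = " << gain
            << ", left output = " << LeafOutput(g_small, h_small)   // 0.8
            << ", right output = " << LeafOutput(g_large, h_large)  // -0.75
            << "\n";
  return 0;
}

Because building a histogram costs time proportional to the data in the leaf while subtraction costs only the number of bins, the learner always constructs the smaller child and derives the larger one.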
src/treelearner/feature_parallel_tree_learner.cpp (new file, mode 100644)
#include "parallel_tree_learner.h"

#include <cstring>
#include <vector>

namespace LightGBM {

FeatureParallelTreeLearner::FeatureParallelTreeLearner(const TreeConfig& tree_config)
  : SerialTreeLearner(tree_config), input_buffer_(nullptr), output_buffer_(nullptr) {
}

FeatureParallelTreeLearner::~FeatureParallelTreeLearner() {
  if (input_buffer_ != nullptr) { delete[] input_buffer_; }
  if (output_buffer_ != nullptr) { delete[] output_buffer_; }
}

void FeatureParallelTreeLearner::Init(const Dataset* train_data) {
  SerialTreeLearner::Init(train_data);
  rank_ = Network::rank();
  num_machines_ = Network::num_machines();
  input_buffer_ = new char[sizeof(SplitInfo) * 2];
  output_buffer_ = new char[sizeof(SplitInfo) * 2];
}

void FeatureParallelTreeLearner::BeforeTrain() {
  SerialTreeLearner::BeforeTrain();
  // get feature partition
  std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
  std::vector<int> num_bins_distributed(num_machines_, 0);
  for (int i = 0; i < train_data_->num_features(); ++i) {
    if (is_feature_used_[i]) {
      int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
      feature_distribution[cur_min_machine].push_back(i);
      num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
      is_feature_used_[i] = false;
    }
  }
  // get local used features
  for (auto fid : feature_distribution[rank_]) {
    is_feature_used_[fid] = true;
  }
}

void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
  int smaller_best_feature = -1, larger_best_feature = -1;
  SplitInfo smaller_best, larger_best;
  // get best split at smaller leaf
  std::vector<double> gains;
  for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
    gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
  }
  smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
  smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
  // get best split at larger leaf
  if (larger_leaf_splits_->LeafIndex() >= 0) {
    gains.clear();
    for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
      gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
    }
    larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
    larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
  }
  // sync global best info
  std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
  std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
  Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
                     output_buffer_, &SplitInfo::MaxReducer);
  // copy back
  std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
  std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
  // update best split
  best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
  if (larger_leaf_splits_->LeafIndex() >= 0) {
    best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
  }
}

}  // namespace LightGBM
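The feature-parallel learner only ships two SplitInfo records per sync, relying on SplitInfo::MaxReducer to keep the higher-gain candidate across machines. A minimal sketch of that max-by-gain fold, using a cut-down stand-in struct rather than the real SplitInfo from split_info.hpp:

// Sketch of a max-by-gain reduction, specialized to one element.
#include <iostream>

struct MiniSplit {   // stand-in with only the field this sketch needs
  int feature;
  double gain;
};

void MaxReduce(const MiniSplit& src, MiniSplit* dst) {
  if (src.gain > dst->gain) { *dst = src; }  // keep the better local best
}

int main() {
  MiniSplit local_best[2] = {{7, 0.42}, {3, 1.30}};  // bests from two machines
  MiniSplit global_best = local_best[0];
  MaxReduce(local_best[1], &global_best);
  std::cout << "global best: feature " << global_best.feature
            << " gain " << global_best.gain << "\n";  // feature 3, gain 1.3
  return 0;
}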
src/treelearner/leaf_splits.hpp (new file, mode 100644)
#ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_

#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include <vector>

namespace LightGBM {
/*!
 * \brief used to find split candidates for a leaf
 */
class LeafSplits {
 public:
  LeafSplits(int num_feature, data_size_t num_data)
    : num_data_in_leaf_(num_data), num_data_(num_data),
      num_features_(num_feature), data_indices_(nullptr) {
    for (int i = 0; i < num_features_; ++i) {
      best_split_per_feature_.push_back(SplitInfo());
      best_split_per_feature_[i].feature = i;
    }
  }

  ~LeafSplits() {
  }

  /*!
   * \brief Init splits on the current leaf; no need to traverse all data
   * \param leaf Index of current leaf
   * \param data_partition current data partition
   * \param sum_gradients
   * \param sum_hessians
   */
  void Init(int leaf, const DataPartition* data_partition, score_t sum_gradients,
            score_t sum_hessians) {
    leaf_index_ = leaf;
    num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
    for (SplitInfo& split_info : best_split_per_feature_) {
      split_info.Reset();
    }
  }

  /*!
   * \brief Init splits on the current leaf; traverses all data to sum up
   * \param gradients
   * \param hessians
   */
  void Init(const score_t* gradients, const score_t* hessians) {
    num_data_in_leaf_ = num_data_;
    leaf_index_ = 0;
    data_indices_ = nullptr;
    score_t tmp_sum_gradients = 0.0;
    score_t tmp_sum_hessians = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      tmp_sum_gradients += gradients[i];
      tmp_sum_hessians += hessians[i];
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
    for (SplitInfo& split_info : best_split_per_feature_) {
      split_info.Reset();
    }
  }

  /*!
   * \brief Init splits on the current leaf; traverses all data to sum up
   * \param leaf Index of current leaf
   * \param data_partition current data partition
   * \param gradients
   * \param hessians
   */
  void Init(int leaf, const DataPartition* data_partition,
            const score_t* gradients, const score_t* hessians) {
    leaf_index_ = leaf;
    num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
    score_t tmp_sum_gradients = 0.0;
    score_t tmp_sum_hessians = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
      data_size_t idx = data_indices_[i];
      tmp_sum_gradients += gradients[idx];
      tmp_sum_hessians += hessians[idx];
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
    for (SplitInfo& split_info : best_split_per_feature_) {
      split_info.Reset();
    }
  }

  /*!
   * \brief Init splits on the current leaf; only updates sum_gradients and sum_hessians
   * \param sum_gradients
   * \param sum_hessians
   */
  void Init(score_t sum_gradients, score_t sum_hessians) {
    leaf_index_ = 0;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
    for (SplitInfo& split_info : best_split_per_feature_) {
      split_info.Reset();
    }
  }

  /*!
   * \brief Init splits on the current leaf
   */
  void Init() {
    leaf_index_ = -1;
    for (SplitInfo& split_info : best_split_per_feature_) {
      split_info.Reset();
    }
  }

  /*! \brief Get the best splits on all features */
  std::vector<SplitInfo>& BestSplitPerFeature() { return best_split_per_feature_; }

  /*! \brief Get the current leaf index */
  int LeafIndex() const { return leaf_index_; }

  /*! \brief Get the number of data in the current leaf */
  data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }

  /*! \brief Get the sum of gradients of the current leaf */
  score_t sum_gradients() const { return sum_gradients_; }

  /*! \brief Get the sum of hessians of the current leaf */
  score_t sum_hessians() const { return sum_hessians_; }

  /*! \brief Get the indices of data in the current leaf */
  data_size_t* data_indices() const { return data_indices_; }

 private:
  /*! \brief stores the best split of each feature on the current leaf */
  std::vector<SplitInfo> best_split_per_feature_;
  /*! \brief current leaf index */
  int leaf_index_;
  /*! \brief number of data on the current leaf */
  data_size_t num_data_in_leaf_;
  /*! \brief number of all training data */
  data_size_t num_data_;
  /*! \brief number of features */
  int num_features_;
  /*! \brief sum of gradients of the current leaf */
  score_t sum_gradients_;
  /*! \brief sum of hessians of the current leaf */
  score_t sum_hessians_;
  /*! \brief indices of data in the current leaf */
  data_size_t* data_indices_;
};

}  // namespace LightGBM
#endif  // LightGBM_TREELEARNER_LEAF_SPLITS_HPP_
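The two data-traversing Init overloads above rely on OpenMP's reduction clause to accumulate sum_gradients_ and sum_hessians_ without locks. A self-contained demo of the same pattern:

// Demo of the OpenMP reduction pattern used by LeafSplits::Init.
#include <omp.h>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> gradients = {0.5f, -1.0f, 0.25f, 0.25f};
  std::vector<float> hessians  = {1.0f,  1.0f, 1.0f,  1.0f};
  double sum_g = 0.0, sum_h = 0.0;
  // each thread keeps private partial sums; OpenMP combines them at the end
  #pragma omp parallel for schedule(static) reduction(+:sum_g, sum_h)
  for (int i = 0; i < static_cast<int>(gradients.size()); ++i) {
    sum_g += gradients[i];
    sum_h += hessians[i];
  }
  std::cout << "sum_gradients = " << sum_g            // 0.0
            << ", sum_hessians = " << sum_h << "\n";  // 4.0
  return 0;
}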
src/treelearner/parallel_tree_learner.h (new file, mode 100644)
#ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_

#include <LightGBM/utils/array_args.h>
#include <LightGBM/network.h>
#include "serial_tree_learner.h"
#include <cstring>
#include <vector>

namespace LightGBM {
/*!
 * \brief Feature parallel learning algorithm.
 * Different machines find the best split on different features, then sync the global best split.
 * When #data is small or #feature is large, use this for a better speed-up.
 */
class FeatureParallelTreeLearner : public SerialTreeLearner {
 public:
  explicit FeatureParallelTreeLearner(const TreeConfig& tree_config);
  ~FeatureParallelTreeLearner();
  virtual void Init(const Dataset* train_data);

 protected:
  void BeforeTrain() override;
  void FindBestSplitsForLeaves() override;

 private:
  /*! \brief rank of local machine */
  int rank_;
  /*! \brief Number of machines in this parallel task */
  int num_machines_;
  /*! \brief Buffer for network send */
  char* input_buffer_;
  /*! \brief Buffer for network receive */
  char* output_buffer_;
};

/*!
 * \brief Data parallel learning algorithm.
 * Workers use local data to construct histograms locally, then sync up global histograms.
 * When #data is large or #feature is small, use this for a better speed-up.
 */
class DataParallelTreeLearner : public SerialTreeLearner {
 public:
  explicit DataParallelTreeLearner(const TreeConfig& tree_config);
  ~DataParallelTreeLearner();
  void Init(const Dataset* train_data) override;

 protected:
  void BeforeTrain() override;
  void FindBestThresholds() override;
  void FindBestSplitsForLeaves() override;
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;

  inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
    if (leaf_idx >= 0) {
      return global_data_count_in_leaf_[leaf_idx];
    } else {
      return 0;
    }
  }

 private:
  /*! \brief Rank of local machine */
  int rank_;
  /*! \brief Number of machines in this parallel task */
  int num_machines_;
  /*! \brief Buffer for network send */
  char* input_buffer_;
  /*! \brief Buffer for network receive */
  char* output_buffer_;
  /*! \brief Different machines aggregate histograms for different features;
      this marks the features aggregated locally */
  bool* is_feature_aggregated_;
  /*! \brief Block start index for reduce scatter */
  int* block_start_;
  /*! \brief Block size for reduce scatter */
  int* block_len_;
  /*! \brief Write positions for feature histograms */
  int* buffer_write_start_pos_;
  /*! \brief Read positions for local feature histograms */
  int* buffer_read_start_pos_;
  /*! \brief Size for reduce scatter */
  int reduce_scatter_size_;
  /*! \brief Stores the global number of data in each leaf */
  data_size_t* global_data_count_in_leaf_;
};

}  // namespace LightGBM
#endif  // LightGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
src/treelearner/serial_tree_learner.cpp (new file, mode 100644)
#include "serial_tree_learner.h"

#include <LightGBM/utils/array_args.h>

#include <algorithm>
#include <vector>

namespace LightGBM {

SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
  : data_partition_(nullptr), is_feature_used_(nullptr),
    historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
    larger_leaf_histogram_array_(nullptr), smaller_leaf_splits_(nullptr),
    larger_leaf_splits_(nullptr), ordered_gradients_(nullptr),
    ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
  // all pointers are initialized with nullptr above
  num_leaves_ = tree_config.num_leaves;
  min_num_data_one_leaf_ = static_cast<data_size_t>(tree_config.min_data_in_leaf);
  min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
  feature_fraction_ = tree_config.feature_fraction;
  random_ = Random(tree_config.feature_fraction_seed);
}

SerialTreeLearner::~SerialTreeLearner() {
  if (data_partition_ != nullptr) { delete data_partition_; }
  if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
  if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
  // guard the per-leaf loop as well, so destroying a learner that was never
  // Init()-ed does not dereference a null historical_histogram_array_
  if (historical_histogram_array_ != nullptr) {
    for (int i = 0; i < num_leaves_; ++i) {
      if (historical_histogram_array_[i] != nullptr) {
        delete[] historical_histogram_array_[i];
      }
    }
    delete[] historical_histogram_array_;
  }
  if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
  if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
  if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
  for (auto& bin : ordered_bins_) { delete bin; }
  if (is_data_in_leaf_ != nullptr) { delete[] is_data_in_leaf_; }
}

void SerialTreeLearner::Init(const Dataset* train_data) {
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
  num_features_ = train_data_->num_features();
  // allocate the space for historical_histogram_array_
  historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
  for (int i = 0; i < num_leaves_; ++i) {
    historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
    for (int j = 0; j < train_data_->num_features(); ++j) {
      historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j), j,
          min_num_data_one_leaf_, min_sum_hessian_one_leaf_);
    }
  }
  // push split information for all leaves
  for (int i = 0; i < num_leaves_; ++i) {
    best_split_per_leaf_.push_back(SplitInfo());
  }
  // initialize ordered_bins_ with nullptr
  for (int i = 0; i < num_features_; ++i) {
    ordered_bins_.push_back(nullptr);
  }
  // get the ordered bins
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    ordered_bins_[i] = train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin();
  }
  // check whether any ordered bin exists
  for (int i = 0; i < num_features_; ++i) {
    if (ordered_bins_[i] != nullptr) {
      has_ordered_bin_ = true;
      break;
    }
  }
  // initialize the splits for leaves
  smaller_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
  larger_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
  // initialize the data partition
  data_partition_ = new DataPartition(num_data_, num_leaves_);
  is_feature_used_ = new bool[num_features_];
  // initialize ordered gradients and hessians
  ordered_gradients_ = new score_t[num_data_];
  ordered_hessians_ = new score_t[num_data_];
  // if there are ordered bins, allocate a buffer for fast splitting
  if (has_ordered_bin_) {
    is_data_in_leaf_ = new char[num_data_];
  }
  Log::Stdout("#data:%d #feature:%d\n", num_data_, num_features_);
}

Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t* hessians) {
  gradients_ = gradients;
  hessians_ = hessians;
  // some initialization work before training
  BeforeTrain();
  Tree* tree = new Tree(num_leaves_);
  // root leaf
  int left_leaf = 0;
  // only the root leaf can be split the first time
  int right_leaf = -1;
  for (int split = 0; split < num_leaves_ - 1; split++) {
    // some initialization work before finding the best split
    if (BeforeFindBestSplit(left_leaf, right_leaf)) {
      // find the best threshold for every feature
      FindBestThresholds();
      // find the best split over all features
      FindBestSplitsForLeaves();
    }
    // Get the leaf with the max split gain
    int best_leaf = static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
    // Get the split information for the best leaf
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    // cannot split, quit
    if (best_leaf_SplitInfo.gain <= 0.0) {
      Log::Stdout("cannot find more splits with gain = %f, current #leaves=%d\n",
                  best_leaf_SplitInfo.gain, split + 1);
      break;
    }
    // split the tree at the best leaf
    Split(tree, best_leaf, &left_leaf, &right_leaf);
  }
  // save a pointer to the last trained tree
  last_trained_tree_ = tree;
  return tree;
}
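Train above grows the tree best-first (leaf-wise): every iteration it re-scans best_split_per_leaf_ with ArrayArgs<SplitInfo>::ArgMax and splits whichever leaf promises the largest gain, stopping early when no gain is positive. The toy loop below mimics that policy with plain doubles; the child-gain updates are invented purely to drive the loop.

// Toy model of the best-first (leaf-wise) growth loop in Train.
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> best_gain_per_leaf = {0.9};  // only the root exists at first
  const int num_leaves = 4;
  for (int split = 0; split < num_leaves - 1; ++split) {
    int best_leaf = static_cast<int>(std::max_element(best_gain_per_leaf.begin(),
                        best_gain_per_leaf.end()) - best_gain_per_leaf.begin());
    if (best_gain_per_leaf[best_leaf] <= 0.0) break;  // cannot split any more, quit
    std::cout << "iteration " << split << ": split leaf " << best_leaf << "\n";
    best_gain_per_leaf[best_leaf] *= 0.5;  // left child reuses the parent's slot
    best_gain_per_leaf.push_back(best_gain_per_leaf[best_leaf] * 0.5);  // right child
  }
  return 0;
}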
void SerialTreeLearner::BeforeTrain() {
  // initialize the used features
  for (int i = 0; i < num_features_; ++i) {
    is_feature_used_[i] = false;
  }
  // Get the features used by the current tree
  size_t used_feature_cnt = static_cast<size_t>(num_features_ * feature_fraction_);
  std::vector<size_t> used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
  for (auto idx : used_feature_indices) {
    is_feature_used_[idx] = true;
  }
  // set all histograms to splittable
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < num_leaves_; ++i) {
    for (int j = 0; j < train_data_->num_features(); ++j) {
      historical_histogram_array_[i][j].set_is_splittable(true);
    }
  }
  // initialize the data partition
  data_partition_->Init();
  // reset the splits for leaves
  for (int i = 0; i < num_leaves_; ++i) {
    best_split_per_leaf_[i].Reset();
  }
  // Sumup for the root
  if (data_partition_->leaf_count(0) == num_data_) {
    // use all data
    smaller_leaf_splits_->Init(gradients_, hessians_);
    // point to gradients, avoiding a copy
    ptr_to_ordered_gradients_ = gradients_;
    ptr_to_ordered_hessians_ = hessians_;
  } else {
    // use bagging, only use a part of the data
    smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
    // copy the used gradients and hessians to the ordered buffers
    const data_size_t* indices = data_partition_->indices();
    data_size_t cnt = data_partition_->leaf_count(0);
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < cnt; ++i) {
      ordered_gradients_[i] = gradients_[indices[i]];
      ordered_hessians_[i] = hessians_[indices[i]];
    }
    // point to ordered_gradients_ and ordered_hessians_
    ptr_to_ordered_gradients_ = ordered_gradients_;
    ptr_to_ordered_hessians_ = ordered_hessians_;
  }
  larger_leaf_splits_->Init();
  // if ordered bins exist, they need to be initialized as well
  if (has_ordered_bin_) {
    if (data_partition_->leaf_count(0) == num_data_) {
      // use all data, pass nullptr
      #pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_features_; ++i) {
        if (ordered_bins_[i] != nullptr) {
          ordered_bins_[i]->Init(nullptr, num_leaves_);
        }
      }
    } else {
      // bagging, only use a part of the data
      // mark the used data
      std::memset(is_data_in_leaf_, 0, sizeof(char) * num_data_);
      const data_size_t* indices = data_partition_->indices();
      data_size_t begin = data_partition_->leaf_begin(0);
      data_size_t end = begin + data_partition_->leaf_count(0);
      #pragma omp parallel for schedule(static)
      for (data_size_t i = begin; i < end; ++i) {
        is_data_in_leaf_[indices[i]] = 1;
      }
      // initialize the ordered bins
      #pragma omp parallel for schedule(guided)
      for (int i = 0; i < num_features_; ++i) {
        if (ordered_bins_[i] != nullptr) {
          ordered_bins_[i]->Init(is_data_in_leaf_, num_leaves_);
        }
      }
    }
  }
}
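The column sampling at the top of BeforeTrain keeps floor(num_features * feature_fraction) features per tree. The sketch below gets the same effect with std::shuffle; Random::Sample from <LightGBM/utils/random.h> is the real mechanism and is not reproduced here.

// Sketch of per-tree feature subsampling via shuffle-and-take.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int main() {
  const int num_features = 10;
  const double feature_fraction = 0.6;
  int used_cnt = static_cast<int>(num_features * feature_fraction);  // 6
  std::vector<int> idx(num_features);
  std::iota(idx.begin(), idx.end(), 0);
  std::mt19937 rng(42);                  // plays the role of feature_fraction_seed
  std::shuffle(idx.begin(), idx.end(), rng);
  std::vector<bool> is_feature_used(num_features, false);
  for (int i = 0; i < used_cnt; ++i) { is_feature_used[idx[i]] = true; }
  for (int f = 0; f < num_features; ++f) {
    std::cout << "feature " << f << (is_feature_used[f] ? " used\n" : " skipped\n");
  }
  return 0;
}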
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // not enough data to continue
  if (num_data_in_right_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)
      && num_data_in_left_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)) {
    best_split_per_leaf_[left_leaf].gain = kMinScore;
    if (right_leaf >= 0) {
      best_split_per_leaf_[right_leaf].gain = kMinScore;
    }
    return false;
  }
  // -1 if there is only one leaf; otherwise the index of the smaller leaf
  int smaller_leaf = -1;
  // only the root exists
  if (right_leaf < 0) {
    smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
    larger_leaf_histogram_array_ = nullptr;
  } else if (num_data_in_left_child < num_data_in_right_child) {
    smaller_leaf = left_leaf;
    // put the parent's (left) histograms into the larger leaf's histograms
    larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
    smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
    // We will construct histograms for the smaller leaf, and smaller_leaf = left_leaf = parent.
    // If we didn't swap the cache, we would overwrite the parent's histogram cache.
    std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
  } else {
    smaller_leaf = right_leaf;
    // put the parent's (left) histograms into the larger leaf's histograms
    larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
    smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
  }
  // init the ordered gradients; only needed when there are 2 leaves
  if (smaller_leaf >= 0) {
    // only need to initialize for the smaller leaf
    // Get the leaf boundary
    const data_size_t* indices = data_partition_->indices();
    data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
    data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
    // copy
    #pragma omp parallel for schedule(static)
    for (data_size_t i = begin; i < end; ++i) {
      ordered_gradients_[i - begin] = gradients_[indices[i]];
      ordered_hessians_[i - begin] = hessians_[indices[i]];
    }
    // assign the pointers
    ptr_to_ordered_gradients_ = ordered_gradients_;
    ptr_to_ordered_hessians_ = ordered_hessians_;
  }
  // split the ordered bins
  if (has_ordered_bin_ && right_leaf >= 0) {
    // mark the data in the left leaf
    std::memset(is_data_in_leaf_, 0, sizeof(char) * num_data_);
    const data_size_t* indices = data_partition_->indices();
    data_size_t begin = data_partition_->leaf_begin(left_leaf);
    data_size_t end = begin + data_partition_->leaf_count(left_leaf);
    #pragma omp parallel for schedule(static)
    for (data_size_t i = begin; i < end; ++i) {
      is_data_in_leaf_[indices[i]] = 1;
    }
    // split the ordered bins
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_features_; ++i) {
      if (ordered_bins_[i] != nullptr) {
        ordered_bins_[i]->Split(left_leaf, right_leaf, is_data_in_leaf_);
      }
    }
  }
  return true;
}

void SerialTreeLearner::FindBestThresholds() {
  #pragma omp parallel for schedule(guided)
  for (int feature_index = 0; feature_index < num_features_; feature_index++) {
    // feature is not used
    if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
    // if the parent (larger) leaf cannot split at the current feature
    if (larger_leaf_histogram_array_ != nullptr
        && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
    // construct histograms for the smaller leaf
    if (ordered_bins_[feature_index] == nullptr) {
      // if not using an ordered bin
      smaller_leaf_histogram_array_[feature_index].Construct(
        smaller_leaf_splits_->data_indices(),
        smaller_leaf_splits_->num_data_in_leaf(),
        smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        ptr_to_ordered_gradients_, ptr_to_ordered_hessians_);
    } else {
      // using an ordered bin
      smaller_leaf_histogram_array_[feature_index].Construct(
        ordered_bins_[feature_index],
        smaller_leaf_splits_->LeafIndex(),
        smaller_leaf_splits_->num_data_in_leaf(),
        smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        gradients_, hessians_);
    }
    // find the best threshold for the smaller child
    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
      &smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
    // only the root leaf exists
    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
    // construct histograms for the larger leaf: we initialize the larger leaf as the parent,
    // so we can just subtract the smaller leaf's histograms
    larger_leaf_histogram_array_[feature_index].Subtract(
      smaller_leaf_histogram_array_[feature_index]);
    // find the best threshold for the larger child
    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
      &larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
  }
}

void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
  // left = parent
  *left_leaf = best_Leaf;
  // split the tree; returns the right leaf
  *right_leaf = tree->Split(best_Leaf,
                            best_split_info.feature,
                            best_split_info.threshold,
                            train_data_->FeatureAt(best_split_info.feature)->feature_index(),
                            train_data_->FeatureAt(best_split_info.feature)
                                ->BinToValue(best_split_info.threshold),
                            best_split_info.left_output,
                            best_split_info.right_output,
                            best_split_info.gain);
  // split the data partition
  data_partition_->Split(best_Leaf,
                         train_data_->FeatureAt(best_split_info.feature)->bin_data(),
                         best_split_info.threshold, *right_leaf);
  // init the leaves used in the next iteration
  if (best_split_info.left_count < best_split_info.right_count) {
    smaller_leaf_splits_->Init(*left_leaf, data_partition_,
                               best_split_info.left_sum_gradient,
                               best_split_info.left_sum_hessian);
    larger_leaf_splits_->Init(*right_leaf, data_partition_,
                              best_split_info.right_sum_gradient,
                              best_split_info.right_sum_hessian);
  } else {
    smaller_leaf_splits_->Init(*right_leaf, data_partition_,
                               best_split_info.right_sum_gradient,
                               best_split_info.right_sum_hessian);
    larger_leaf_splits_->Init(*left_leaf, data_partition_,
                              best_split_info.left_sum_gradient,
                              best_split_info.left_sum_hessian);
  }
}

}  // namespace LightGBM
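SerialTreeLearner::Split stores the threshold twice: as a bin index (used by DataPartition::Split for fast partitioning) and, through Feature::BinToValue, as a real-valued threshold kept in the tree for prediction. A sketch of that mapping, assuming the common convention that each bin records its upper boundary; the actual bin-to-value rule in this commit may differ:

// Sketch of a bin-index-to-threshold-value lookup (hypothetical boundaries).
#include <iostream>
#include <vector>

int main() {
  // hypothetical upper boundaries of 4 bins for one feature
  std::vector<double> bin_upper_bound = {0.1, 0.35, 0.8, 1e30};
  unsigned int threshold_bin = 1;  // SplitInfo::threshold from the histogram scan
  double threshold_value = bin_upper_bound[threshold_bin];  // BinToValue analogue
  std::cout << "rows with value <= " << threshold_value << " go left\n";  // 0.35
  return 0;
}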
src/treelearner/serial_tree_learner.h (new file, mode 100644)
#ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_

#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>

#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/feature.h>

#include "feature_histogram.hpp"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"

#include <cstdio>
#include <vector>
#include <random>
#include <cmath>

namespace LightGBM {

/*!
 * \brief Used for learning a tree on a single machine
 */
class SerialTreeLearner : public TreeLearner {
 public:
  explicit SerialTreeLearner(const TreeConfig& tree_config);

  ~SerialTreeLearner();

  void Init(const Dataset* train_data) override;

  Tree* Train(const score_t* gradients, const score_t* hessians) override;

  void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
    data_partition_->SetUsedDataIndices(used_indices, num_data);
  }

  void AddPredictionToScore(score_t* out_score) const override {
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < data_partition_->num_leaves(); ++i) {
      double output = last_trained_tree_->LeafOutput(i);
      data_size_t* tmp_idx = nullptr;
      data_size_t cnt_leaf_data = data_partition_->GetIndexOnLeaf(i, &tmp_idx);
      for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
        out_score[tmp_idx[j]] += static_cast<score_t>(output);
      }
    }
  }

 protected:
  /*!
   * \brief Some initial work before training
   */
  virtual void BeforeTrain();
  /*!
   * \brief Some initial work before FindBestSplit
   */
  virtual bool BeforeFindBestSplit(int left_leaf, int right_leaf);
  /*!
   * \brief Find the best thresholds for all features, using multi-threading.
   * The result is stored in smaller_leaf_splits_ and larger_leaf_splits_.
   * This function is called in FindBestSplit.
   */
  virtual void FindBestThresholds();
  /*!
   * \brief Find the best feature for each leaf from smaller_leaf_splits_ and larger_leaf_splits_.
   * This function is called after FindBestThresholds.
   */
  inline virtual void FindBestSplitsForLeaves();
  /*!
   * \brief Partition the tree and the data according to the best split.
   * \param tree Current tree; it will be split by this function.
   * \param best_leaf The index of the leaf to be split.
   * \param left_leaf The index of the left leaf after the split.
   * \param right_leaf The index of the right leaf after the split.
   */
  virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);
  /*!
   * \brief Get the number of data points in a leaf
   * \param leaf_idx The index of the leaf
   * \return The number of data points in leaf leaf_idx
   */
  inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
  /*!
   * \brief Find the best feature for one leaf from leaf_splits
   * \param leaf_splits
   */
  inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);

  /*! \brief Last trained decision tree */
  const Tree* last_trained_tree_;
  /*! \brief number of data points */
  data_size_t num_data_;
  /*! \brief number of features */
  int num_features_;
  /*! \brief training data */
  const Dataset* train_data_;
  /*! \brief gradients of the current iteration */
  const score_t* gradients_;
  /*! \brief hessians of the current iteration */
  const score_t* hessians_;
  /*! \brief number of total leaves */
  int num_leaves_;
  /*! \brief minimal number of data points in one leaf */
  data_size_t min_num_data_one_leaf_;
  /*! \brief minimal sum of hessians in one leaf */
  score_t min_sum_hessian_one_leaf_;
  /*! \brief sub-feature fraction rate */
  double feature_fraction_;
  /*! \brief training data partition on leaves */
  DataPartition* data_partition_;
  /*! \brief used to generate the subset of used features */
  Random random_;
  /*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
  bool* is_feature_used_;
  /*! \brief cache of historical histograms, used to speed up histogram construction */
  FeatureHistogram** historical_histogram_array_;
  /*! \brief pointer to the histogram array of the smaller leaf */
  FeatureHistogram* smaller_leaf_histogram_array_;
  /*! \brief pointer to the histogram array of the larger leaf */
  FeatureHistogram* larger_leaf_histogram_array_;
  /*! \brief stores the best split point for every leaf */
  std::vector<SplitInfo> best_split_per_leaf_;
  /*! \brief stores the best threshold of every feature for the smaller leaf */
  LeafSplits* smaller_leaf_splits_;
  /*! \brief stores the best threshold of every feature for the larger leaf */
  LeafSplits* larger_leaf_splits_;
  /*! \brief gradients of the current iteration, reordered for cache optimization */
  score_t* ordered_gradients_;
  /*! \brief hessians of the current iteration, reordered for cache optimization */
  score_t* ordered_hessians_;
  /*! \brief pointer to ordered_gradients_; used to avoid a copy in BeforeTrain */
  const score_t* ptr_to_ordered_gradients_;
  /*! \brief pointer to ordered_hessians_; used to avoid a copy in BeforeTrain */
  const score_t* ptr_to_ordered_hessians_;
  /*! \brief stores the ordered bins */
  std::vector<OrderedBin*> ordered_bins_;
  /*! \brief true if any feature has an ordered bin */
  bool has_ordered_bin_ = false;
  /*! \brief is_data_in_leaf_[i] != 0 means the i-th data point is marked */
  char* is_data_in_leaf_;
};

inline void SerialTreeLearner::FindBestSplitsForLeaves() {
  FindBestSplitForLeaf(smaller_leaf_splits_);
  FindBestSplitForLeaf(larger_leaf_splits_);
}

inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
  if (leafIdx >= 0) {
    return data_partition_->leaf_count(leafIdx);
  } else {
    return 0;
  }
}

inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
  if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) {
    return;
  }
  std::vector<double> gains;
  for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
    gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
  }
  int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
  int leaf = leaf_splits->LeafIndex();
  best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
  best_split_per_leaf_[leaf].feature = best_feature;
}

}  // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
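FindBestSplitForLeaf() above reduces each leaf's per-feature SplitInfo array to a single winner by taking the arg-max over the gains. A small self-contained illustration follows; ArgMaxDouble is a hypothetical stand-in for ArrayArgs<double>::ArgMax, whose real implementation lives in <LightGBM/utils/array_args.h> and is not shown on this page.

// Sketch: picking the best feature for a leaf by arg-max over per-feature gains.
#include <cstdio>
#include <cstddef>
#include <vector>

static size_t ArgMaxDouble(const std::vector<double>& v) {
  size_t best = 0;
  for (size_t i = 1; i < v.size(); ++i) {
    if (v[i] > v[best]) best = i;
  }
  return best;
}

int main() {
  // pretend these are the best gains of features 0..3 on one leaf
  std::vector<double> gains = {0.12, 0.90, 0.45, 0.90};
  size_t best_feature = ArgMaxDouble(gains);
  // a first-wins arg-max keeps the smaller feature index on ties, which is
  // consistent with the tie-break in SplitInfo::operator> (split_info.hpp below)
  std::printf("best feature: %zu (gain %.2f)\n", best_feature, gains[best_feature]);
  return 0;
}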
src/treelearner/split_info.hpp
0 → 100644
View file @
1c774687
#ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
#define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_

#include <LightGBM/meta.h>

#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>

namespace LightGBM {

/*!
 * \brief Used to store information about a gain split point
 */
struct SplitInfo {
 public:
  /*! \brief Feature index */
  int feature;
  /*! \brief Split threshold */
  unsigned int threshold;
  /*! \brief Left output after split */
  score_t left_output;
  /*! \brief Right output after split */
  score_t right_output;
  /*! \brief Split gain */
  score_t gain;
  /*! \brief Number of data points on the left after the split */
  data_size_t left_count;
  /*! \brief Number of data points on the right after the split */
  data_size_t right_count;
  /*! \brief Sum of gradients on the left after the split */
  score_t left_sum_gradient;
  /*! \brief Sum of hessians on the left after the split */
  score_t left_sum_hessian;
  /*! \brief Sum of gradients on the right after the split */
  score_t right_sum_gradient;
  /*! \brief Sum of hessians on the right after the split */
  score_t right_sum_hessian;

  SplitInfo() {
    // initialize with feature -1 and -inf gain
    feature = -1;
    gain = kMinScore;
  }

  inline void Reset() {
    // initialize with feature -1 and -inf gain
    feature = -1;
    gain = kMinScore;
  }

  inline bool operator>(const SplitInfo& si) const;

  inline static void MaxReducer(const char* src, char* dst, int len) {
    const int type_size = sizeof(SplitInfo);
    int used_size = 0;
    const SplitInfo* p1;
    SplitInfo* p2;
    while (used_size < len) {
      p1 = reinterpret_cast<const SplitInfo*>(src);
      p2 = reinterpret_cast<SplitInfo*>(dst);
      if (*p1 > *p2) {
        // copy the winning split into dst
        std::memcpy(dst, src, type_size);
      }
      src += type_size;
      dst += type_size;
      used_size += type_size;
    }
  }
};

inline bool SplitInfo::operator>(const SplitInfo& si) const {
  score_t local_gain = this->gain;
  score_t other_gain = si.gain;
  // replace NaN with -inf
  if (std::isnan(local_gain)) {
    local_gain = kMinScore;
  }
  // replace NaN with -inf
  if (std::isnan(other_gain)) {
    other_gain = kMinScore;
  }
  int local_feature = this->feature;
  int other_feature = si.feature;
  // replace -1 with max int
  if (local_feature == -1) {
    local_feature = INT32_MAX;
  }
  // replace -1 with max int
  if (other_feature == -1) {
    other_feature = INT32_MAX;
  }
  if (local_gain != other_gain) {
    return local_gain > other_gain;
  } else {
    // on equal gain, prefer the smaller feature index
    return local_feature < other_feature;
  }
}

}  // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
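MaxReducer is shaped like a reduce callback for merging serialized SplitInfo buffers element-wise, presumably for use by the parallel tree learners when machines exchange their best splits. A self-contained sketch of that contract follows; MiniSplit and MaxReduce are hypothetical stand-ins defined here only so the example compiles on its own.

// Sketch: element-wise max-reduce of two best-split buffers, dst = max(dst, src).
#include <cstdio>
#include <cstring>

struct MiniSplit {
  int feature;
  double gain;
  bool operator>(const MiniSplit& other) const {
    if (gain != other.gain) return gain > other.gain;
    return feature < other.feature;  // same tie-break: smaller feature index wins
  }
};

// matches MaxReducer's contract: walk both byte buffers one record at a time
static void MaxReduce(const char* src, char* dst, int len) {
  const int type_size = sizeof(MiniSplit);
  for (int used = 0; used < len; used += type_size, src += type_size, dst += type_size) {
    const MiniSplit* p1 = reinterpret_cast<const MiniSplit*>(src);
    MiniSplit* p2 = reinterpret_cast<MiniSplit*>(dst);
    if (*p1 > *p2) std::memcpy(dst, src, type_size);
  }
}

int main() {
  MiniSplit mine[2]   = {{0, 0.5}, {3, 0.9}};  // this machine's best per leaf
  MiniSplit theirs[2] = {{1, 0.7}, {2, 0.9}};  // another machine's buffer
  MaxReduce(reinterpret_cast<const char*>(theirs),
            reinterpret_cast<char*>(mine), static_cast<int>(sizeof(mine)));
  // leaf 0: 0.7 beats 0.5; leaf 1: equal gain, feature 2 beats feature 3
  std::printf("leaf0 -> feature %d, leaf1 -> feature %d\n",
              mine[0].feature, mine[1].feature);
  return 0;
}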
src/treelearner/tree_learner.cpp
0 → 100644
View file @
1c774687
#include <LightGBM/tree_learner.h>

#include "serial_tree_learner.h"
#include "parallel_tree_learner.h"

namespace LightGBM {

TreeLearner* TreeLearner::CreateTreeLearner(TreeLearnerType type, const TreeConfig& tree_config) {
  if (type == TreeLearnerType::kSerialTreeLearner) {
    return new SerialTreeLearner(tree_config);
  } else if (type == TreeLearnerType::kFeatureParallelTreelearner) {
    return new FeatureParallelTreeLearner(tree_config);
  } else if (type == TreeLearnerType::kDataParallelTreeLearner) {
    return new DataParallelTreeLearner(tree_config);
  }
  return nullptr;
}

}  // namespace LightGBM
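A sketch of how this factory would be called, using only what is visible in this commit: the three TreeLearnerType values tested above and the Init/Train interface from serial_tree_learner.h. It assumes the LightGBM headers are on the include path and that TreeConfig is declared by them; the function name BuildLearner is hypothetical.

// Sketch: constructing a learner through the factory (caller owns the pointer).
#include <LightGBM/tree_learner.h>

void BuildLearner(const LightGBM::TreeConfig& tree_config) {
  using namespace LightGBM;
  TreeLearner* learner =
      TreeLearner::CreateTreeLearner(TreeLearnerType::kSerialTreeLearner, tree_config);
  // ... learner->Init(train_data); learner->Train(gradients, hessians); ...
  delete learner;  // the factory returns a raw owning pointer
}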
windows/LightGBM.sln
0 → 100644
View file @
1c774687
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25123.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_mpi|x64 = Debug_mpi|x64
Debug|x64 = Debug|x64
Release_mpi|x64 = Release_mpi|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
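The four solution configurations map directly onto the vcxproj below: the plain Debug/Release builds define USE_SOCKET, while the _mpi variants define USE_MPI and link msmpi.lib, so those require the MS-MPI SDK to be installed (it provides the $(MSMPI_INC) and $(MSMPI_LIB64) paths the project references). Assuming msbuild is on the PATH, a command along these lines selects one of them:

msbuild windows\LightGBM.sln /p:Configuration=Release /p:Platform=x64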
windows/LightGBM.vcxproj
0 → 100644
View file @
1c774687
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="Projectconfigurations">
    <ProjectConfiguration Include="Debug_mpi|x64">
      <Configuration>Debug_mpi</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release_mpi|x64">
      <Configuration>Release_mpi</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}</ProjectGuid>
    <RootNamespace>LightGBM</RootNamespace>
    <SccProjectName>SAK</SccProjectName>
    <SccAuxPath>SAK</SccAuxPath>
    <SccLocalPath>SAK</SccLocalPath>
    <SccProvider>SAK</SccProvider>
    <ProjectName>LightGBM</ProjectName>
    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release_mpi|x64'">
    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'">
    <IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug_mpi|x64'">
    <IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'">
    <IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
    <IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
    <ClCompile>
      <PreprocessorDefinitions>USE_MPI</PreprocessorDefinitions>
      <WarningLevel>Level4</WarningLevel>
      <OpenMPSupport>true</OpenMPSupport>
      <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
      <InlineFunctionExpansion>Default</InlineFunctionExpansion>
      <IntrinsicFunctions>false</IntrinsicFunctions>
      <EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
      <WholeProgramOptimization>false</WholeProgramOptimization>
      <Optimization>Disabled</Optimization>
    </ClCompile>
    <Link>
      <AdditionalLibraryDirectories></AdditionalLibraryDirectories>
    </Link>
    <Link>
      <AdditionalDependencies>msmpi.lib</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PreprocessorDefinitions>USE_SOCKET</PreprocessorDefinitions>
      <WarningLevel>Level4</WarningLevel>
      <OpenMPSupport>true</OpenMPSupport>
      <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
      <InlineFunctionExpansion>Default</InlineFunctionExpansion>
      <IntrinsicFunctions>false</IntrinsicFunctions>
      <EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
      <WholeProgramOptimization>false</WholeProgramOptimization>
      <Optimization>Disabled</Optimization>
    </ClCompile>
    <Link>
      <AdditionalDependencies></AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>USE_MPI;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <OpenMPSupport>true</OpenMPSupport>
      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
      <EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
      <WholeProgramOptimization>true</WholeProgramOptimization>
      <OmitFramePointers>true</OmitFramePointers>
      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
    </ClCompile>
    <Link>
      <AdditionalLibraryDirectories></AdditionalLibraryDirectories>
    </Link>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>msmpi.lib</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <PreprocessorDefinitions>USE_SOCKET;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <WarningLevel>Level4</WarningLevel>
      <OpenMPSupport>true</OpenMPSupport>
      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
      <WholeProgramOptimization>true</WholeProgramOptimization>
      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
      <OmitFramePointers>true</OmitFramePointers>
      <FunctionLevelLinking>true</FunctionLevelLinking>
    </ClCompile>
    <Link>
      <AdditionalDependencies />
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="..\include\LightGBM\application.h" />
    <ClInclude Include="..\include\LightGBM\bin.h" />
    <ClInclude Include="..\include\LightGBM\boosting.h" />
    <ClInclude Include="..\include\LightGBM\config.h" />
    <ClInclude Include="..\include\LightGBM\dataset.h" />
    <ClInclude Include="..\include\LightGBM\feature.h" />
    <ClInclude Include="..\include\LightGBM\meta.h" />
    <ClInclude Include="..\include\LightGBM\metric.h" />
    <ClInclude Include="..\include\LightGBM\network.h" />
    <ClInclude Include="..\include\LightGBM\objective_function.h" />
    <ClInclude Include="..\include\LightGBM\tree.h" />
    <ClInclude Include="..\include\LightGBM\tree_learner.h" />
    <ClInclude Include="..\include\LightGBM\utils\array_args.h" />
    <ClInclude Include="..\include\LightGBM\utils\common.h" />
    <ClInclude Include="..\include\LightGBM\utils\log.h" />
    <ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
    <ClInclude Include="..\include\LightGBM\utils\random.h" />
    <ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
    <ClInclude Include="..\include\LightGBM\utils\threading.h" />
    <ClInclude Include="..\src\application\predictor.hpp" />
    <ClInclude Include="..\src\boosting\gbdt.h" />
    <ClInclude Include="..\src\boosting\score_updater.hpp" />
    <ClInclude Include="..\src\io\dense_bin.hpp" />
    <ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
    <ClInclude Include="..\src\io\parser.hpp" />
    <ClInclude Include="..\src\io\sparse_bin.hpp" />
    <ClInclude Include="..\src\metric\binary_metric.hpp" />
    <ClInclude Include="..\src\metric\rank_metric.hpp" />
    <ClInclude Include="..\src\metric\regression_metric.hpp" />
    <ClInclude Include="..\src\network\linkers.h" />
    <ClInclude Include="..\src\network\socket_wrapper.hpp" />
    <ClInclude Include="..\src\objective\binary_objective.hpp" />
    <ClInclude Include="..\src\objective\rank_objective.hpp" />
    <ClInclude Include="..\src\objective\regression_objective.hpp" />
    <ClInclude Include="..\src\treelearner\data_partition.hpp" />
    <ClInclude Include="..\src\treelearner\feature_histogram.hpp" />
    <ClInclude Include="..\src\treelearner\leaf_splits.hpp" />
    <ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
    <ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
    <ClInclude Include="..\src\treelearner\split_info.hpp" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\src\application\application.cpp" />
    <ClCompile Include="..\src\boosting\boosting.cpp" />
    <ClCompile Include="..\src\boosting\gbdt.cpp" />
    <ClCompile Include="..\src\io\bin.cpp" />
    <ClCompile Include="..\src\io\config.cpp" />
    <ClCompile Include="..\src\io\dataset.cpp" />
    <ClCompile Include="..\src\io\metadata.cpp" />
    <ClCompile Include="..\src\io\parser.cpp" />
    <ClCompile Include="..\src\io\tree.cpp" />
    <ClCompile Include="..\src\metric\dcg_calculator.cpp" />
    <ClCompile Include="..\src\metric\metric.cpp" />
    <ClCompile Include="..\src\network\network.cpp" />
    <ClCompile Include="..\src\network\linkers_mpi.cpp" />
    <ClCompile Include="..\src\network\linkers_socket.cpp" />
    <ClCompile Include="..\src\network\linker_topo.cpp" />
    <ClCompile Include="..\src\objective\objective_function.cpp" />
    <ClCompile Include="..\src\main.cpp" />
    <ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp" />
    <ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp" />
    <ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
    <ClCompile Include="..\src\treelearner\tree_learner.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
\ No newline at end of file
windows/LightGBM.vcxproj.filters
0 → 100644
View file @
1c774687
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="src">
      <UniqueIdentifier>{6e213f6b-b843-4469-bc8c-56c1ffe7f195}</UniqueIdentifier>
    </Filter>
    <Filter Include="include">
      <UniqueIdentifier>{29082261-e6cd-40b2-b30c-c4cb70f23339}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\application">
      <UniqueIdentifier>{3a703e42-6f06-4ab1-8e46-0dfb07407d9e}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\boosting">
      <UniqueIdentifier>{43be32f9-227b-4a15-9c0e-38dbf9747aeb}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\io">
      <UniqueIdentifier>{6fcdaf19-880a-45b0-80db-344be9498017}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\metric">
      <UniqueIdentifier>{8bacb16c-7f31-494f-94df-8ccc6c3e3894}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\network">
      <UniqueIdentifier>{93db474b-4ab8-406b-99ec-eb8e40f97593}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\objective">
      <UniqueIdentifier>{34d576af-dec6-4cad-90bd-f8d0e95ec614}</UniqueIdentifier>
    </Filter>
    <Filter Include="src\treelearner">
      <UniqueIdentifier>{16638c37-41bd-4124-8b80-befbca2f969f}</UniqueIdentifier>
    </Filter>
    <Filter Include="include\LightGBM">
      <UniqueIdentifier>{37b41659-26e2-4b2f-ac0c-7b52d8bd53da}</UniqueIdentifier>
    </Filter>
    <Filter Include="include\LightGBM\utils">
      <UniqueIdentifier>{bf66b9f7-015e-404d-8098-4353abc46956}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\src\boosting\gbdt.h">
      <Filter>src\boosting</Filter>
    </ClInclude>
    <ClInclude Include="..\src\network\linkers.h">
      <Filter>src\network</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\parallel_tree_learner.h">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\serial_tree_learner.h">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\src\application\predictor.hpp">
      <Filter>src\application</Filter>
    </ClInclude>
    <ClInclude Include="..\src\boosting\score_updater.hpp">
      <Filter>src\boosting</Filter>
    </ClInclude>
    <ClInclude Include="..\src\io\dense_bin.hpp">
      <Filter>src\io</Filter>
    </ClInclude>
    <ClInclude Include="..\src\io\ordered_sparse_bin.hpp">
      <Filter>src\io</Filter>
    </ClInclude>
    <ClInclude Include="..\src\io\parser.hpp">
      <Filter>src\io</Filter>
    </ClInclude>
    <ClInclude Include="..\src\io\sparse_bin.hpp">
      <Filter>src\io</Filter>
    </ClInclude>
    <ClInclude Include="..\src\metric\binary_metric.hpp">
      <Filter>src\metric</Filter>
    </ClInclude>
    <ClInclude Include="..\src\metric\rank_metric.hpp">
      <Filter>src\metric</Filter>
    </ClInclude>
    <ClInclude Include="..\src\metric\regression_metric.hpp">
      <Filter>src\metric</Filter>
    </ClInclude>
    <ClInclude Include="..\src\network\socket_wrapper.hpp">
      <Filter>src\network</Filter>
    </ClInclude>
    <ClInclude Include="..\src\objective\binary_objective.hpp">
      <Filter>src\objective</Filter>
    </ClInclude>
    <ClInclude Include="..\src\objective\rank_objective.hpp">
      <Filter>src\objective</Filter>
    </ClInclude>
    <ClInclude Include="..\src\objective\regression_objective.hpp">
      <Filter>src\objective</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\data_partition.hpp">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\feature_histogram.hpp">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\leaf_splits.hpp">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\src\treelearner\split_info.hpp">
      <Filter>src\treelearner</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\application.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\bin.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\boosting.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\config.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\dataset.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\feature.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\meta.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\metric.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\network.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\objective_function.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\tree.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\tree_learner.h">
      <Filter>include\LightGBM</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\array_args.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\common.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\log.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\random.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\text_reader.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
    <ClInclude Include="..\include\LightGBM\utils\threading.h">
      <Filter>include\LightGBM\utils</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\src\application\application.cpp">
      <Filter>src\application</Filter>
    </ClCompile>
    <ClCompile Include="..\src\network\linkers_socket.cpp">
      <Filter>src\network</Filter>
    </ClCompile>
    <ClCompile Include="..\src\network\linkers_mpi.cpp">
      <Filter>src\network</Filter>
    </ClCompile>
    <ClCompile Include="..\src\treelearner\serial_tree_learner.cpp">
      <Filter>src\treelearner</Filter>
    </ClCompile>
    <ClCompile Include="..\src\treelearner\tree_learner.cpp">
      <Filter>src\treelearner</Filter>
    </ClCompile>
    <ClCompile Include="..\src\Boosting\gbdt.cpp">
      <Filter>src\boosting</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\dataset.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\bin.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\tree.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\objective\objective_function.cpp">
      <Filter>src\objective</Filter>
    </ClCompile>
    <ClCompile Include="..\src\Boosting\boosting.cpp">
      <Filter>src\boosting</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\parser.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\metric\metric.cpp">
      <Filter>src\metric</Filter>
    </ClCompile>
    <ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp">
      <Filter>src\treelearner</Filter>
    </ClCompile>
    <ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp">
      <Filter>src\treelearner</Filter>
    </ClCompile>
    <ClCompile Include="..\src\network\linker_topo.cpp">
      <Filter>src\network</Filter>
    </ClCompile>
    <ClCompile Include="..\src\network\network.cpp">
      <Filter>src\network</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\config.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\metric\dcg_calculator.cpp">
      <Filter>src\metric</Filter>
    </ClCompile>
    <ClCompile Include="..\src\io\metadata.cpp">
      <Filter>src\io</Filter>
    </ClCompile>
    <ClCompile Include="..\src\main.cpp">
      <Filter>src</Filter>
    </ClCompile>
  </ItemGroup>
</Project>
\ No newline at end of file