tianlh / LightGBM-DCU · Commits

Commit eade219e, authored Mar 18, 2017 by Qiwei Ye

    merge conflict

Parents: f23e6083, 060bd316

Showing 9 changed files with 500 additions and 197 deletions (+500 −197)
src/treelearner/serial_tree_learner.h             +10  −41
src/treelearner/split_info.hpp                    +31  −0
src/treelearner/voting_parallel_tree_learner.cpp  +178 −62
tests/python_package_test/test_basic.py           +4   −4
tests/python_package_test/test_engine.py          +107 −47
tests/python_package_test/test_plotting.py        +111 −0
tests/python_package_test/test_sklearn.py         +37  −33
windows/LightGBM.vcxproj                          +4   −1
windows/LightGBM.vcxproj.filters                  +18  −9
src/treelearner/serial_tree_learner.h
@@ -7,10 +7,10 @@
 #include <LightGBM/tree_learner.h>
 #include <LightGBM/dataset.h>
 #include <LightGBM/tree.h>
 #include <LightGBM/feature.h>
 #include "feature_histogram.hpp"
-#include "data_partition.hpp"
 #include "split_info.hpp"
+#include "data_partition.hpp"
 #include "leaf_splits.hpp"
 #include <cstdio>
@@ -32,6 +32,8 @@ public:
   void Init(const Dataset* train_data) override;
+  void ResetTrainingData(const Dataset* train_data) override;
+  void ResetConfig(const TreeConfig* tree_config) override;
   Tree* Train(const score_t* gradients, const score_t* hessians) override;
@@ -41,7 +43,8 @@ public:
   }
   void AddPredictionToScore(double* out_score) const override {
-    #pragma omp parallel for schedule(guided)
+    if (last_trained_tree_->num_leaves() <= 1) { return; }
+    #pragma omp parallel for schedule(static)
     for (int i = 0; i < data_partition_->num_leaves(); ++i) {
       double output = static_cast<double>(last_trained_tree_->LeafOutput(i));
       data_size_t cnt_leaf_data = 0;
@@ -75,7 +78,7 @@ protected:
   * \brief Find best features for leaves from smaller_leaf_splits_ and larger_leaf_splits_.
   * This function will be called after FindBestThresholds.
   */
-  inline virtual void FindBestSplitsForLeaves();
+  virtual void FindBestSplitsForLeaves();
   /*!
   * \brief Partition tree and data according best split.
@@ -93,12 +96,6 @@ protected:
   */
   inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
-  /*!
-  * \brief Find best features for leaf from leaf_splits
-  * \param leaf_splits
-  */
-  inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);
   /*! \brief Last trained decision tree */
   const Tree* last_trained_tree_;
   /*! \brief number of data */
@@ -116,7 +113,7 @@ protected:
   /*! \brief used for generate used features */
   Random random_;
   /*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
-  std::vector<bool> is_feature_used_;
+  std::vector<int8_t> is_feature_used_;
   /*! \brief pointer to histograms array of parent of current leaves */
   FeatureHistogram* parent_leaf_histogram_array_;
   /*! \brief pointer to histograms array of smaller leaf */
@@ -137,15 +134,6 @@ protected:
   /*! \brief hessians of current iteration, ordered for cache optimized */
   std::vector<score_t> ordered_hessians_;
-  /*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_gradients_smaller_leaf_;
-  /*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
-  const score_t* ptr_to_ordered_hessians_smaller_leaf_;
-  /*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_gradients_larger_leaf_;
-  /*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
-  const score_t* ptr_to_ordered_hessians_larger_leaf_;
   /*! \brief Store ordered bin */
   std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
   /*! \brief True if has ordered bin */
@@ -156,15 +144,10 @@ protected:
   HistogramPool histogram_pool_;
   /*! \brief config of tree learner*/
   const TreeConfig* tree_config_;
+  int num_threads_;
+  std::vector<int> ordered_bin_indices_;
 };
-inline void SerialTreeLearner::FindBestSplitsForLeaves() {
-  FindBestSplitForLeaf(smaller_leaf_splits_.get());
-  FindBestSplitForLeaf(larger_leaf_splits_.get());
-}
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
   if (leafIdx >= 0) {
     return data_partition_->leaf_count(leafIdx);
@@ -173,19 +156,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) cons
   }
 }
-inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
-  if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) { return; }
-  std::vector<double> gains;
-  for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
-    gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
-  }
-  int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-  int leaf = leaf_splits->LeafIndex();
-  best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
-  best_split_per_leaf_[leaf].feature = best_feature;
-}
 }  // namespace LightGBM
 #endif   // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
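
One detail worth pausing on in the header above: is_feature_used_ moves from std::vector<bool> to std::vector<int8_t>. vector<bool> is a packed bitset specialization, so writes to neighboring elements can touch the same byte and there is no contiguous data() buffer to hand to bulk APIs; one byte per flag avoids both problems when the vector is filled from an OpenMP loop. A minimal standalone sketch of the difference (illustrative only, not LightGBM code):

  #include <cstdint>
  #include <vector>
  #include <omp.h>

  int main() {
    const int n = 1024;
    // std::vector<bool> packs 8 flags per byte, so two threads writing
    // neighboring elements can race on the same byte. One byte per flag
    // makes concurrent writes to distinct indices safe.
    std::vector<int8_t> is_feature_used(n, 0);
  #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; ++i) {
      is_feature_used[i] = (i % 2 == 0) ? 1 : 0;  // independent byte writes
    }
    // int8_t also gives a real contiguous buffer, unlike vector<bool>.
    volatile const int8_t* raw = is_feature_used.data();
    (void)raw;
    return 0;
  }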
src/treelearner/split_info.hpp
@@ -53,6 +53,8 @@ public:
   inline bool operator > (const SplitInfo& si) const;
+
+  inline bool operator == (const SplitInfo& si) const;
   inline static void MaxReducer(const char* src, char* dst, int len) {
     const int type_size = sizeof(SplitInfo);
     int used_size = 0;
@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
   }
 }
+inline bool SplitInfo::operator == (const SplitInfo& si) const {
+  double local_gain = this->gain;
+  double other_gain = si.gain;
+  // replace nan with -inf
+  if (local_gain == NAN) {
+    local_gain = kMinScore;
+  }
+  // replace nan with -inf
+  if (other_gain == NAN) {
+    other_gain = kMinScore;
+  }
+  int local_feature = this->feature;
+  int other_feature = si.feature;
+  // replace -1 with max int
+  if (local_feature == -1) {
+    local_feature = INT32_MAX;
+  }
+  // replace -1 with max int
+  if (other_feature == -1) {
+    other_feature = INT32_MAX;
+  }
+  if (local_gain != other_gain) {
+    return local_gain == other_gain;
+  } else {
+    // if same gain, use smaller feature
+    return local_feature == other_feature;
+  }
+}
 }  // namespace LightGBM
 #endif   // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
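
A caveat on the operator== body above: under IEEE 754, NaN compares unequal to everything, including itself, so a test like local_gain == NAN can never be true and the kMinScore substitution never fires. A small self-contained sketch of the check that does detect NaN, assuming only the standard library (illustrative only, not part of the commit):

  #include <cassert>
  #include <cmath>
  #include <limits>

  int main() {
    const double kMinScore = -std::numeric_limits<double>::infinity();
    double gain = std::numeric_limits<double>::quiet_NaN();
    // NaN != NaN, so an equality test against NAN is never true.
    assert(!(gain == NAN));
    // std::isnan is the reliable way to detect NaN before substituting.
    if (std::isnan(gain)) {
      gain = kMinScore;
    }
    assert(gain == kMinScore);
    return 0;
  }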
src/treelearner/voting_parallel_tree_learner.cpp
@@ -26,8 +26,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
   // get max bin
   int max_bin = 0;
   for (int i = 0; i < num_features_; ++i) {
-    if (max_bin < train_data_->FeatureAt(i)->num_bin()) {
-      max_bin = train_data_->FeatureAt(i)->num_bin();
+    if (max_bin < train_data_->FeatureNumBin(i)) {
+      max_bin = train_data_->FeatureNumBin(i);
     }
   }
   // calculate buffer size
@@ -46,21 +46,42 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
   larger_buffer_read_start_pos_.resize(num_features_);
   global_data_count_in_leaf_.resize(tree_config_->num_leaves);
-  smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
-  larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
+  smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
+  larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
   local_tree_config_ = *tree_config_;
   local_tree_config_.min_data_in_leaf /= num_machines_;
   local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
-  histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features());
+  histogram_pool_.ResetConfig(&local_tree_config_);
   // initialize histograms for global
   smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
   larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
-  for (int j = 0; j < num_features_; ++j) {
-    smaller_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_);
-    larger_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_);
-  }
+  auto num_total_bin = train_data_->NumTotalBin();
+  smaller_leaf_histogram_data_.resize(num_total_bin);
+  larger_leaf_histogram_data_.resize(num_total_bin);
+  feature_metas_.resize(train_data->num_features());
+#pragma omp parallel for schedule(static)
+  for (int i = 0; i < train_data->num_features(); ++i) {
+    feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
+      feature_metas_[i].bias = 1;
+    } else {
+      feature_metas_[i].bias = 0;
+    }
+    feature_metas_[i].tree_config = tree_config_;
+  }
+  uint64_t offset = 0;
+  for (int j = 0; j < train_data->num_features(); ++j) {
+    offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
+    smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
+    larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
+    auto num_bin = train_data->FeatureNumBin(j);
+    if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
+      num_bin -= 1;
+    }
+    offset += static_cast<uint64_t>(num_bin);
+  }
 }
@@ -71,12 +92,11 @@ void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) {
   local_tree_config_.min_data_in_leaf /= num_machines_;
   local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
-  histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features());
+  histogram_pool_.ResetConfig(&local_tree_config_);
   global_data_count_in_leaf_.resize(tree_config_->num_leaves);
-  for (int j = 0; j < num_features_; ++j) {
-    smaller_leaf_histogram_array_global_[j].ResetConfig(tree_config_);
-    larger_leaf_histogram_array_global_[j].ResetConfig(tree_config_);
+  for (size_t i = 0; i < feature_metas_.size(); ++i) {
+    feature_metas_[i].tree_config = tree_config_;
   }
 }
@@ -183,17 +203,17 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
   while (cur_used_features < cur_total_feature) {
     // copy smaller leaf histograms first
     if (smaller_idx < smaller_top_features.size()) {
-      int fid = smaller_top_features[smaller_idx];
+      int inner_feature_index = train_data_->InnerFeatureIndex(smaller_top_features[smaller_idx]);
       ++cur_used_features;
       // mark local aggregated feature
       if (i == rank_) {
-        smaller_is_feature_aggregated_[fid] = true;
-        smaller_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
+        smaller_is_feature_aggregated_[inner_feature_index] = true;
+        smaller_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
       }
       // copy
-      std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[fid].HistogramData(), smaller_leaf_histogram_array_[fid].SizeOfHistgram());
-      cur_size += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
-      reduce_scatter_size_ += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
+      std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[inner_feature_index].RawData(), smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
+      cur_size += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
+      reduce_scatter_size_ += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
      ++smaller_idx;
     }
     if (cur_used_features >= cur_total_feature) {
@@ -201,17 +221,17 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
     }
     // then copy larger leaf histograms
     if (larger_idx < larger_top_features.size()) {
-      int fid = larger_top_features[larger_idx];
+      int inner_feature_index = train_data_->InnerFeatureIndex(larger_top_features[larger_idx]);
       ++cur_used_features;
       // mark local aggregated feature
       if (i == rank_) {
-        larger_is_feature_aggregated_[fid] = true;
-        larger_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
+        larger_is_feature_aggregated_[inner_feature_index] = true;
+        larger_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
       }
       // copy
-      std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[fid].HistogramData(), larger_leaf_histogram_array_[fid].SizeOfHistgram());
-      cur_size += larger_leaf_histogram_array_[fid].SizeOfHistgram();
-      reduce_scatter_size_ += larger_leaf_histogram_array_[fid].SizeOfHistgram();
+      std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[inner_feature_index].RawData(), larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
+      cur_size += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
+      reduce_scatter_size_ += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
       ++larger_idx;
     }
   }
@@ -225,11 +245,83 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
 void VotingParallelTreeLearner::FindBestThresholds() {
-  // use local data to find local best splits
-  SerialTreeLearner::FindBestThresholds();
+  std::vector<int8_t> is_feature_used(num_features_, 0);
+#pragma omp parallel for schedule(static)
+  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    if (!is_feature_used_[feature_index]) continue;
+    if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
+      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
+      continue;
+    }
+    is_feature_used[feature_index] = 1;
+  }
+  bool use_subtract = true;
+  if (parent_leaf_histogram_array_ == nullptr) {
+    use_subtract = false;
+  }
+  // construct smaller leaf
+  HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
+  train_data_->ConstructHistograms(is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->LeafIndex(), ordered_bins_, gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), ptr_smaller_leaf_hist_data);
+  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
+    // construct larger leaf
+    HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
+    train_data_->ConstructHistograms(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->LeafIndex(), ordered_bins_, gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), ptr_larger_leaf_hist_data);
+  }
+  std::vector<SplitInfo> smaller_bestsplit_per_features(num_features_);
+  std::vector<SplitInfo> larger_bestsplit_per_features(num_features_);
+  // find splits
+#pragma omp parallel for schedule(static)
+  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    if (!is_feature_used[feature_index]) { continue; }
+    const int real_feature_index = train_data_->RealFeatureIndex(feature_index);
+    train_data_->FixHistogram(feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_histogram_array_[feature_index].RawData());
+    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), smaller_leaf_splits_->num_data_in_leaf(), &smaller_bestsplit_per_features[feature_index]);
+    smaller_bestsplit_per_features[feature_index].feature = real_feature_index;
+    // only has root leaf
+    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
+    if (use_subtract) {
+      larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
+    } else {
+      train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_histogram_array_[feature_index].RawData());
+    }
+    // find best threshold for larger child
+    larger_leaf_histogram_array_[feature_index].FindBestThreshold(larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), larger_leaf_splits_->num_data_in_leaf(), &larger_bestsplit_per_features[feature_index]);
+    larger_bestsplit_per_features[feature_index].feature = real_feature_index;
+  }
   std::vector<SplitInfo> smaller_top_k_splits, larger_top_k_splits;
   // local voting
-  ArrayArgs<SplitInfo>::MaxK(smaller_leaf_splits_->BestSplitPerFeature(), top_k_, &smaller_top_k_splits);
-  ArrayArgs<SplitInfo>::MaxK(larger_leaf_splits_->BestSplitPerFeature(), top_k_, &larger_top_k_splits);
+  ArrayArgs<SplitInfo>::MaxK(smaller_bestsplit_per_features, top_k_, &smaller_top_k_splits);
+  ArrayArgs<SplitInfo>::MaxK(larger_bestsplit_per_features, top_k_, &larger_top_k_splits);
   // gather
   int offset = 0;
   for (int i = 0; i < top_k_; ++i) {
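
The use_subtract path in the hunk above leans on the histogram identity behind FeatureHistogram::Subtract: a child's histogram equals the parent's histogram minus the sibling's, so only the smaller leaf needs a fresh pass over the data. A minimal sketch of that trick with a hypothetical bin type (not LightGBM's HistogramBinEntry):

  #include <cstddef>
  #include <vector>

  // Hypothetical per-bin accumulator mirroring the gradient/hessian/count
  // triple that a histogram bin entry carries.
  struct BinEntry {
    double sum_gradients = 0.0;
    double sum_hessians = 0.0;
    int count = 0;
  };

  // larger-leaf histogram = parent histogram - smaller-leaf histogram,
  // computed bin by bin; this avoids a second scan over the data.
  void SubtractHistogram(const std::vector<BinEntry>& parent,
                         const std::vector<BinEntry>& smaller,
                         std::vector<BinEntry>* larger) {
    larger->resize(parent.size());
    for (std::size_t b = 0; b < parent.size(); ++b) {
      (*larger)[b].sum_gradients = parent[b].sum_gradients - smaller[b].sum_gradients;
      (*larger)[b].sum_hessians = parent[b].sum_hessians - smaller[b].sum_hessians;
      (*larger)[b].count = parent[b].count - smaller[b].count;
    }
  }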
@@ -262,54 +354,78 @@ void VotingParallelTreeLearner::FindBestThresholds() {
   // Reduce scatter for histogram
-  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
+  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(),
+                         output_buffer_.data(), &HistogramBinEntry::SumReducer);
+  std::vector<SplitInfo> smaller_best(num_threads_);
+  std::vector<SplitInfo> larger_best(num_threads_);
   // find best split from local aggregated histograms
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    const int tid = omp_get_thread_num();
     if (smaller_is_feature_aggregated_[feature_index]) {
-      smaller_leaf_histogram_array_global_[feature_index].SetSumup(GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians());
+      SplitInfo smaller_split;
       // restore from buffer
-      smaller_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]);
+      smaller_leaf_histogram_array_global_[feature_index].FromMemory(
+        output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]);
+      train_data_->FixHistogram(feature_index, smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), smaller_leaf_histogram_array_global_[feature_index].RawData());
       // find best threshold
-      smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold(&smaller_leaf_splits_global_->BestSplitPerFeature()[feature_index]);
+      smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold(smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), &smaller_split);
+      if (smaller_split.gain > smaller_best[tid].gain) {
+        smaller_best[tid] = smaller_split;
+        smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+      }
     }
     if (larger_is_feature_aggregated_[feature_index]) {
-      larger_leaf_histogram_array_global_[feature_index].SetSumup(GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians());
+      SplitInfo larger_split;
       // restore from buffer
       larger_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + larger_buffer_read_start_pos_[feature_index]);
+      train_data_->FixHistogram(feature_index, larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), larger_leaf_histogram_array_global_[feature_index].RawData());
       // find best threshold
-      larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(&larger_leaf_splits_global_->BestSplitPerFeature()[feature_index]);
+      larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), &larger_split);
+      if (larger_split.gain > larger_best[tid].gain) {
+        larger_best[tid] = larger_split;
+        larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+      }
     }
   }
+  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
+  int leaf = smaller_leaf_splits_->LeafIndex();
+  best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
+  if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
+    leaf = larger_leaf_splits_->LeafIndex();
+    auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
+    best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
+  }
 }
 void VotingParallelTreeLearner::FindBestSplitsForLeaves() {
-  int smaller_best_feature = -1, larger_best_feature = -1;
   // find local best
   SplitInfo smaller_best, larger_best;
-  std::vector<double> gains;
-  for (size_t i = 0; i < smaller_leaf_splits_global_->BestSplitPerFeature().size(); ++i) {
-    gains.push_back(smaller_leaf_splits_global_->BestSplitPerFeature()[i].gain);
-  }
-  smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-  smaller_best = smaller_leaf_splits_global_->BestSplitPerFeature()[smaller_best_feature];
-  if (larger_leaf_splits_global_->LeafIndex() >= 0) {
-    gains.clear();
-    for (size_t i = 0; i < larger_leaf_splits_global_->BestSplitPerFeature().size(); ++i) {
-      gains.push_back(larger_leaf_splits_global_->BestSplitPerFeature()[i].gain);
-    }
-    larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-    larger_best = larger_leaf_splits_global_->BestSplitPerFeature()[larger_best_feature];
+  smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
+  // find local best split for larger leaf
+  if (larger_leaf_splits_->LeafIndex() >= 0) {
+    larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
   }
   // sync global best info
   std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
@@ -336,18 +452,18 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf,
   // init the global sumup info
   if (best_split_info.left_count < best_split_info.right_count) {
     smaller_leaf_splits_global_->Init(*left_leaf, data_partition_.get(),
-                                      best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
+                                      best_split_info.left_sum_gradient,
+                                      best_split_info.left_sum_hessian);
     larger_leaf_splits_global_->Init(*right_leaf, data_partition_.get(),
-                                     best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
+                                     best_split_info.right_sum_gradient,
+                                     best_split_info.right_sum_hessian);
   } else {
     smaller_leaf_splits_global_->Init(*right_leaf, data_partition_.get(),
-                                      best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
+                                      best_split_info.right_sum_gradient,
+                                      best_split_info.right_sum_hessian);
     larger_leaf_splits_global_->Init(*left_leaf, data_partition_.get(),
-                                     best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
+                                     best_split_info.left_sum_gradient,
+                                     best_split_info.left_sum_hessian);
   }
 }
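
To keep the control flow above readable: voting-parallel learning first has each machine pick its top-k split candidates per leaf (the ArrayArgs<SplitInfo>::MaxK calls), then aggregates full histograms only for the features that win the vote, via Network::ReduceScatter. A self-contained sketch of the local top-k step, with a hypothetical SplitCandidate type standing in for SplitInfo (illustrative, not LightGBM's ArrayArgs):

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  struct SplitCandidate {
    int feature = -1;
    double gain = 0.0;
  };

  // Return the k candidates with the largest gain, mirroring the role of
  // ArrayArgs<SplitInfo>::MaxK in the local-voting step above.
  std::vector<SplitCandidate> TopK(std::vector<SplitCandidate> candidates, std::size_t k) {
    k = std::min(k, candidates.size());
    std::partial_sort(candidates.begin(), candidates.begin() + k, candidates.end(),
                      [](const SplitCandidate& a, const SplitCandidate& b) {
                        return a.gain > b.gain;  // larger gain first
                      });
    candidates.resize(k);  // keep only the k winners for the vote
    return candidates;
  }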
tests/python_package_test/test_basic.py
@@ -6,21 +6,21 @@ import unittest
 import lightgbm as lgb
 import numpy as np
-from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_breast_cancer, dump_svmlight_file
 from sklearn.model_selection import train_test_split


 class TestBasic(unittest.TestCase):

     def test(self):
-        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
+        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)
         train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
         valid_data = train_data.create_valid(X_test, label=y_test)

         params = {"objective": "binary",
                   "metric": "auc",
-                  "min_data": 1,
+                  "min_data": 10,
                   "num_leaves": 15,
                   "verbose": -1}
@@ -36,7 +36,7 @@ class TestBasic(unittest.TestCase):
         with tempfile.NamedTemporaryFile() as f:
             tname = f.name
         with open(tname, "w+b") as f:
-            np.savetxt(f, X_test, delimiter=',')
+            dump_svmlight_file(X_test, y_test, f)
         pred_from_file = bst.predict(tname)
         os.remove(tname)
         self.assertEqual(len(pred_from_matr), len(pred_from_file))
tests/python_package_test/test_engine.py
@@ -10,11 +10,17 @@ import numpy as np
 from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, load_iris)
 from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, TimeSeriesSplit

+try:
+    import pandas as pd
+    IS_PANDAS_INSTALLED = True
+except ImportError:
+    IS_PANDAS_INSTALLED = False
 try:
     import cPickle as pickle
-except:
+except ImportError:
     import pickle
@@ -22,31 +28,33 @@ def multi_logloss(y_true, y_pred):
     return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])


-def test_template(params={'objective': 'regression', 'metric': 'l2'},
-                  X_y=load_boston(True), feval=mean_squared_error,
-                  num_round=100, init_model=None, custom_eval=None,
-                  early_stopping_rounds=10,
-                  return_data=False, return_model=False):
-    params['verbose'], params['seed'] = -1, 42
-    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
-    lgb_train = lgb.Dataset(X_train, y_train, params=params)
-    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
-    if return_data:
-        return lgb_train, lgb_eval
-    evals_result = {}
-    gbm = lgb.train(params, lgb_train,
-                    num_boost_round=num_round,
-                    valid_sets=lgb_eval,
-                    valid_names='eval',
-                    verbose_eval=False,
-                    feval=custom_eval,
-                    evals_result=evals_result,
-                    early_stopping_rounds=early_stopping_rounds,
-                    init_model=init_model)
-    if return_model:
-        return gbm
-    else:
-        return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
+class template(object):
+    @staticmethod
+    def test_template(params={'objective': 'regression', 'metric': 'l2'},
+                      X_y=load_boston(True), feval=mean_squared_error,
+                      num_round=150, init_model=None, custom_eval=None,
+                      early_stopping_rounds=10,
+                      return_data=False, return_model=False):
+        params['verbose'], params['seed'] = -1, 42
+        X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
+        lgb_train = lgb.Dataset(X_train, y_train, params=params)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
+        if return_data:
+            return lgb_train, lgb_eval
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=num_round,
+                        valid_sets=lgb_eval,
+                        valid_names='eval',
+                        verbose_eval=False,
+                        feval=custom_eval,
+                        evals_result=evals_result,
+                        early_stopping_rounds=early_stopping_rounds,
+                        init_model=init_model)
+        if return_model:
+            return gbm
+        else:
+            return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))


 class TestEngine(unittest.TestCase):
@@ -57,12 +65,12 @@ class TestEngine(unittest.TestCase):
             'objective': 'binary',
             'metric': 'binary_logloss'
         }
-        evals_result, ret = test_template(params, X_y, log_loss)
+        evals_result, ret = template.test_template(params, X_y, log_loss)
         self.assertLess(ret, 0.15)
         self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5)

     def test_regreesion(self):
-        evals_result, ret = test_template()
+        evals_result, ret = template.test_template()
         ret **= 0.5
         self.assertLess(ret, 4)
         self.assertAlmostEqual(min(evals_result['eval']['l2']), ret, places=5)
@@ -74,7 +82,7 @@ class TestEngine(unittest.TestCase):
             'metric': 'multi_logloss',
             'num_class': 10
         }
-        evals_result, ret = test_template(params, X_y, multi_logloss)
+        evals_result, ret = template.test_template(params, X_y, multi_logloss)
         self.assertLess(ret, 0.2)
         self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
@@ -84,11 +92,11 @@ class TestEngine(unittest.TestCase):
             'metric': 'l1'
         }
         model_name = 'model.txt'
-        gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
+        gbm = template.test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
         gbm.save_model(model_name)
-        evals_result, ret = test_template(params, feval=mean_absolute_error,
-                                          num_round=80, init_model=model_name,
-                                          custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
+        evals_result, ret = template.test_template(params, feval=mean_absolute_error,
+                                                   num_round=80, init_model=model_name,
+                                                   custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
         self.assertLess(ret, 3)
         self.assertAlmostEqual(min(evals_result['eval']['l1']), ret, places=5)
         for l1, mae in zip(evals_result['eval']['l1'], evals_result['eval']['mae']):
@@ -104,38 +112,90 @@ class TestEngine(unittest.TestCase):
             'metric': 'multi_logloss',
             'num_class': 3
         }
-        gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
-        evals_result, ret = test_template(params, X_y, feval=multi_logloss,
-                                          num_round=80, init_model=gbm)
+        gbm = template.test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
+        evals_result, ret = template.test_template(params, X_y, feval=multi_logloss,
+                                                   num_round=80, init_model=gbm)
         self.assertLess(ret, 1.5)
         self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)

     def test_cv(self):
-        lgb_train, _ = test_template(return_data=True)
-        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
+        lgb_train, _ = template.test_template(return_data=True)
+        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5, shuffle=False,
                metrics='l1', verbose_eval=False,
                callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
+        tss = TimeSeriesSplit(3)
+        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, data_splitter=tss,
+               nfold=5,  # test if wrong nfold is ignored
+               metrics='l2', verbose_eval=False)
+
+    def test_feature_name(self):
+        lgb_train, _ = template.test_template(return_data=True)
+        feature_names = ['f' + str(i) for i in range(13)]
+        gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=10, feature_name=feature_names)
+        self.assertListEqual(feature_names, gbm.feature_name())

     def test_save_load_copy_pickle(self):
-        gbm = test_template(num_round=20, return_model=True)
-        _, ret_origin = test_template(init_model=gbm)
+        gbm = template.test_template(num_round=20, return_model=True)
+        _, ret_origin = template.test_template(init_model=gbm)
         other_ret = []
         gbm.save_model('lgb.model')
-        other_ret.append(test_template(init_model='lgb.model')[1])
+        other_ret.append(template.test_template(init_model='lgb.model')[1])
         gbm_load = lgb.Booster(model_file='lgb.model')
-        other_ret.append(test_template(init_model=gbm_load)[1])
-        other_ret.append(test_template(init_model=copy.copy(gbm))[1])
-        other_ret.append(test_template(init_model=copy.deepcopy(gbm))[1])
+        other_ret.append(template.test_template(init_model=gbm_load)[1])
+        other_ret.append(template.test_template(init_model=copy.copy(gbm))[1])
+        other_ret.append(template.test_template(init_model=copy.deepcopy(gbm))[1])
         with open('lgb.pkl', 'wb') as f:
             pickle.dump(gbm, f)
         with open('lgb.pkl', 'rb') as f:
             gbm_pickle = pickle.load(f)
-        other_ret.append(test_template(init_model=gbm_pickle)[1])
+        other_ret.append(template.test_template(init_model=gbm_pickle)[1])
         gbm_pickles = pickle.loads(pickle.dumps(gbm))
-        other_ret.append(test_template(init_model=gbm_pickles)[1])
+        other_ret.append(template.test_template(init_model=gbm_pickles)[1])
         for ret in other_ret:
             self.assertAlmostEqual(ret_origin, ret, places=5)

+    @unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
+    def test_pandas_categorical(self):
+        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
+                          "B": np.random.permutation([1, 2, 3] * 100),  # int
+                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
+                          "D": np.random.permutation([True, False] * 150)})  # bool
+        y = np.random.permutation([0, 1] * 150)
+        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
+                               "B": np.random.permutation([1, 3] * 30),
+                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
+                               "D": np.random.permutation([True, False] * 30)})
+        for col in ["A", "B", "C", "D"]:
+            X[col] = X[col].astype('category')
+            X_test[col] = X_test[col].astype('category')
+        params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}
+        lgb_train = lgb.Dataset(X, y)
+        gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
+        pred0 = list(gbm0.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, categorical_feature=[0])
+        pred1 = list(gbm1.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, categorical_feature=['A'])
+        pred2 = list(gbm2.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, categorical_feature=['A', 'B', 'C', 'D'])
+        pred3 = list(gbm3.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm3.save_model('categorical.model')
+        gbm4 = lgb.Booster(model_file='categorical.model')
+        pred4 = list(gbm4.predict(X_test))
+        self.assertListEqual(pred0, pred1)
+        self.assertListEqual(pred0, pred2)
+        self.assertListEqual(pred0, pred3)
+        self.assertListEqual(pred0, pred4)


 print("----------------------------------------------------------------------")
 print("running test_engine.py")
tests/python_package_test/test_plotting.py (new file, mode 0 → 100644)
# coding: utf-8
# pylint: skip-file
import unittest

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

try:
    import matplotlib
    matplotlib.use('Agg')
    matplotlib_installed = True
except ImportError:
    matplotlib_installed = False


class TestBasic(unittest.TestCase):

    @unittest.skipIf(not matplotlib_installed, 'matplotlib not installed')
    def test_plot_importance(self):
        X_train, _, y_train, _ = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)

        params = {
            "objective": "binary",
            "verbose": -1,
            "num_leaves": 3
        }
        gbm0 = lgb.train(params, train_data, num_boost_round=10)
        ax0 = lgb.plot_importance(gbm0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Feature importance')
        self.assertEqual(ax0.get_xlabel(), 'Feature importance')
        self.assertEqual(ax0.get_ylabel(), 'Features')
        self.assertLessEqual(len(ax0.patches), 30)

        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm1.fit(X_train, y_train)

        ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
        self.assertIsInstance(ax1, matplotlib.axes.Axes)
        self.assertEqual(ax1.get_title(), 't')
        self.assertEqual(ax1.get_xlabel(), 'x')
        self.assertEqual(ax1.get_ylabel(), 'y')
        self.assertLessEqual(len(ax1.patches), 30)
        for patch in ax1.patches:
            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

        ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
        self.assertLessEqual(len(ax2.patches), 30)
        self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
        self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
        self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
        self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b

    @unittest.skip('Graphviz are not executables on Travis')
    def test_plot_tree(self):
        pass

    @unittest.skipIf(not matplotlib_installed, 'matplotlib not installed')
    def test_plot_metrics(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)
        test_data = lgb.Dataset(X_test, y_test, reference=train_data)

        params = {
            "objective": "binary",
            "metric": {"binary_logloss", "binary_error"},
            "verbose": -1,
            "num_leaves": 3
        }

        evals_result0 = {}
        gbm0 = lgb.train(params, train_data,
                         valid_sets=[train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(params, train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')


print("----------------------------------------------------------------------")
print("running test_plotting.py")

unittest.main()
tests/python_package_test/test_sklearn.py
@@ -12,42 +12,44 @@ from sklearn.metrics import log_loss, mean_squared_error
 from sklearn.model_selection import GridSearchCV, train_test_split


-def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
-                  feval=mean_squared_error, num_round=100,
-                  custom_obj=None, predict_proba=False,
-                  return_data=False, return_model=False):
-    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
-    if return_data:
-        return X_train, X_test, y_train, y_test
-    arguments = {'n_estimators': num_round, 'silent': True}
-    if custom_obj:
-        arguments['objective'] = custom_obj
-    gbm = model(**arguments)
-    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
-    if return_model:
-        return gbm
-    elif predict_proba:
-        return feval(y_test, gbm.predict_proba(X_test))
-    else:
-        return feval(y_test, gbm.predict(X_test))
+class template(object):
+    @staticmethod
+    def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
+                      feval=mean_squared_error, num_round=100,
+                      custom_obj=None, predict_proba=False,
+                      return_data=False, return_model=False):
+        X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
+        if return_data:
+            return X_train, X_test, y_train, y_test
+        arguments = {'n_estimators': num_round, 'silent': True}
+        if custom_obj:
+            arguments['objective'] = custom_obj
+        gbm = model(**arguments)
+        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
+        if return_model:
+            return gbm
+        elif predict_proba:
+            return feval(y_test, gbm.predict_proba(X_test))
+        else:
+            return feval(y_test, gbm.predict(X_test))


 class TestSklearn(unittest.TestCase):

     def test_binary(self):
         X_y = load_breast_cancer(True)
-        ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
         self.assertLess(ret, 0.15)

     def test_regreesion(self):
-        self.assertLess(test_template() ** 0.5, 4)
+        self.assertLess(template.test_template() ** 0.5, 4)

     def test_multiclass(self):
         X_y = load_digits(10, True)

         def multi_error(y_true, y_pred):
             return np.mean(y_true != y_pred)

-        ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, multi_error)
         self.assertLess(ret, 0.2)

     def test_lambdarank(self):
@@ -68,7 +70,7 @@ class TestSklearn(unittest.TestCase):
             grad = (y_pred - y_true)
             hess = np.ones(len(y_true))
             return grad, hess
-        ret = test_template(custom_obj=objective_ls)
+        ret = template.test_template(custom_obj=objective_ls)
         self.assertLess(ret, 100)

     def test_binary_classification_with_custom_objective(self):
@@ -81,17 +83,17 @@ class TestSklearn(unittest.TestCase):
         def binary_error(y_test, y_pred):
             return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
-        ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
         self.assertLess(ret, 0.1)

     def test_dart(self):
-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         gbm = lgb.LGBMRegressor(boosting_type='dart')
         gbm.fit(X_train, y_train)
         self.assertLessEqual(gbm.score(X_train, y_train), 1.)

     def test_grid_search(self):
-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         params = {'boosting_type': ['dart', 'gbdt'],
                   'n_estimators': [15, 20],
                   'drop_rate': [0.1, 0.2]}
@@ -100,27 +102,29 @@ class TestSklearn(unittest.TestCase):
         self.assertIn(gbm.best_params_['n_estimators'], [15, 20])

     def test_clone_and_property(self):
-        gbm = test_template(return_model=True)
+        gbm = template.test_template(return_model=True)
         gbm_clone = clone(gbm)
         self.assertIsInstance(gbm.booster_, lgb.Booster)
-        self.assertIsInstance(gbm.feature_importance_, np.ndarray)
-        clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
+        self.assertIsInstance(gbm.feature_importances_, np.ndarray)
+        clf = template.test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
         self.assertListEqual(sorted(clf.classes_), [0, 1])
         self.assertEqual(clf.n_classes_, 2)
         self.assertIsInstance(clf.booster_, lgb.Booster)
-        self.assertIsInstance(clf.feature_importance_, np.ndarray)
+        self.assertIsInstance(clf.feature_importances_, np.ndarray)

     def test_joblib(self):
-        gbm = test_template(num_round=10, return_model=True)
+        gbm = template.test_template(num_round=10, return_model=True)
         joblib.dump(gbm, 'lgb.pkl')
         gbm_pickle = joblib.load('lgb.pkl')
         self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
         self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
-        self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
         gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
         self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
         for key in gbm.evals_result_:
             for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
                 self.assertAlmostEqual(*evals, places=5)
         pred_origin = gbm.predict(X_test)
         pred_pickle = gbm_pickle.predict(X_test)
         self.assertEqual(len(pred_origin), len(pred_pickle))
windows/LightGBM.vcxproj
@@ -196,7 +196,7 @@
     <ClInclude Include="..\include\LightGBM\c_api.h" />
     <ClInclude Include="..\include\LightGBM\dataset.h" />
     <ClInclude Include="..\include\LightGBM\dataset_loader.h" />
-    <ClInclude Include="..\include\LightGBM\feature.h" />
+    <ClInclude Include="..\include\LightGBM\feature_group.h" />
     <ClInclude Include="..\include\LightGBM\meta.h" />
     <ClInclude Include="..\include\LightGBM\metric.h" />
     <ClInclude Include="..\include\LightGBM\network.h" />
@@ -206,6 +206,7 @@
     <ClInclude Include="..\include\LightGBM\utils\array_args.h" />
     <ClInclude Include="..\include\LightGBM\utils\common.h" />
     <ClInclude Include="..\include\LightGBM\utils\log.h" />
+    <ClInclude Include="..\include\LightGBM\utils\openmp_wrapper.h" />
     <ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
     <ClInclude Include="..\include\LightGBM\utils\random.h" />
     <ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
@@ -213,8 +214,10 @@
     <ClInclude Include="..\src\application\predictor.hpp" />
     <ClInclude Include="..\src\boosting\gbdt.h" />
     <ClInclude Include="..\src\boosting\dart.hpp" />
+    <ClInclude Include="..\src\boosting\goss.hpp" />
     <ClInclude Include="..\src\boosting\score_updater.hpp" />
     <ClInclude Include="..\src\io\dense_bin.hpp" />
+    <ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
     <ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
     <ClInclude Include="..\src\io\parser.hpp" />
     <ClInclude Include="..\src\io\sparse_bin.hpp" />
windows/LightGBM.vcxproj.filters
@@ -96,15 +96,9 @@
     <ClInclude Include="..\src\treelearner\data_partition.hpp">
       <Filter>src\treelearner</Filter>
     </ClInclude>
-    <ClInclude Include="..\src\treelearner\feature_histogram.hpp">
-      <Filter>src\treelearner</Filter>
-    </ClInclude>
     <ClInclude Include="..\src\treelearner\leaf_splits.hpp">
       <Filter>src\treelearner</Filter>
     </ClInclude>
-    <ClInclude Include="..\src\treelearner\split_info.hpp">
-      <Filter>src\treelearner</Filter>
-    </ClInclude>
     <ClInclude Include="..\include\LightGBM\application.h">
       <Filter>include\LightGBM</Filter>
     </ClInclude>
@@ -120,9 +114,6 @@
     <ClInclude Include="..\include\LightGBM\dataset.h">
      <Filter>include\LightGBM</Filter>
     </ClInclude>
-    <ClInclude Include="..\include\LightGBM\feature.h">
-      <Filter>include\LightGBM</Filter>
-    </ClInclude>
     <ClInclude Include="..\include\LightGBM\meta.h">
       <Filter>include\LightGBM</Filter>
     </ClInclude>
@@ -171,6 +162,24 @@
     <ClInclude Include="..\src\boosting\dart.hpp">
       <Filter>src\boosting</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\LightGBM\feature_group.h">
+      <Filter>include\LightGBM</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\treelearner\feature_histogram.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\treelearner\split_info.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\boosting\goss.hpp">
+      <Filter>src\boosting</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\io\dense_nbits_bin.hpp">
+      <Filter>src\io</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\LightGBM\utils\openmp_wrapper.h">
+      <Filter>include\LightGBM\utils</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\src\application\application.cpp">