tianlh / LightGBM-DCU · Commits

Commit eade219e, authored Mar 18, 2017 by Qiwei Ye

    merge conflict

Parents: f23e6083, 060bd316
Showing 20 changed files with 843 additions and 260 deletions (+843, -260)
include/LightGBM/dataset.h                +141  -30
include/LightGBM/dataset_loader.h         +10   -9
include/LightGBM/export.h                 +21   -0
include/LightGBM/feature_group.h          +190  -0
include/LightGBM/metric.h                 +1    -1
include/LightGBM/network.h                +1    -1
include/LightGBM/objective_function.h     +1    -1
include/LightGBM/tree.h                   +27   -38
include/LightGBM/tree_learner.h           +2    -0
include/LightGBM/utils/array_args.h       +90   -41
include/LightGBM/utils/common.h           +4    -1
include/LightGBM/utils/log.h              +13   -1
include/LightGBM/utils/openmp_wrapper.h   +27   -0
include/LightGBM/utils/random.h           +28   -19
include/LightGBM/utils/threading.h        +1    -1
python-package/lightgbm/__init__.py       +11   -4
python-package/lightgbm/basic.py          +191  -59
python-package/lightgbm/callback.py       +28   -25
python-package/lightgbm/compat.py         +13   -2
python-package/lightgbm/engine.py         +43   -27
include/LightGBM/dataset.h  (+141 -30)

@@ -6,7 +6,7 @@
 #include <LightGBM/meta.h>
 #include <LightGBM/config.h>
-#include <LightGBM/feature.h>
+#include <LightGBM/feature_group.h>
 #include <vector>
 #include <utility>

@@ -19,7 +19,6 @@ namespace LightGBM {
 /*! \brief forward declaration */
 class DatasetLoader;
 /*!
 * \brief This class is used to store some meta(non-feature) data for training data,
 * e.g. labels, weights, initial scores, qurey level informations.

@@ -88,8 +87,6 @@ public:
   void SetQuery(const data_size_t* query, data_size_t len);
-  void SetQueryId(const data_size_t* query_id, data_size_t len);
   /*!
   * \brief Set initial scores
   * \param init_score Initial scores, this class will manage memory for init_score.

@@ -175,7 +172,7 @@ public:
   * \brief Get Number of queries
   * \return Number of queries
   */
-  inline const data_size_t num_queries() const { return num_queries_; }
+  inline data_size_t num_queries() const { return num_queries_; }
   /*!
   * \brief Get weights for queries, if not exists, will return nullptr

@@ -244,6 +241,9 @@ private:
   std::vector<data_size_t> queries_;
   /*! \brief mutex for threading safe call */
   std::mutex mutex_;
+  bool weight_load_from_file_;
+  bool query_load_from_file_;
+  bool init_score_load_from_file_;
 };

@@ -280,14 +280,20 @@ class Dataset {
 public:
   friend DatasetLoader;
-  Dataset();
-  Dataset(data_size_t num_data);
+  LIGHTGBM_EXPORT Dataset();
+  LIGHTGBM_EXPORT Dataset(data_size_t num_data);
+  void Construct(std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+                 const std::vector<std::vector<int>>& sample_non_zero_indices,
+                 size_t total_sample_cnt, const IOConfig& io_config);
   /*! \brief Destructor */
-  ~Dataset();
-  bool CheckAlign(const Dataset& other) const {
+  LIGHTGBM_EXPORT ~Dataset();
+  LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
     if (num_features_ != other.num_features_) {
       return false;
     }

@@ -298,7 +304,7 @@ public:
       return false;
     }
     for (int i = 0; i < num_features_; ++i) {
-      if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
+      if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
         return false;
       }
     }

@@ -306,57 +312,140 @@ public:
   }
   inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
+    if (is_finish_load_) { return; }
     for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
       int feature_idx = used_feature_map_[i];
       if (feature_idx >= 0) {
-        features_[feature_idx]->PushData(tid, row_idx, feature_values[i]);
+        const int group = feature2group_[feature_idx];
+        const int sub_feature = feature2subfeature_[feature_idx];
+        feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
       }
     }
   }
   inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
+    if (is_finish_load_) { return; }
     for (auto& inner_data : feature_values) {
       if (inner_data.first >= num_total_features_) { continue; }
       int feature_idx = used_feature_map_[inner_data.first];
       if (feature_idx >= 0) {
-        features_[feature_idx]->PushData(tid, row_idx, inner_data.second);
+        const int group = feature2group_[feature_idx];
+        const int sub_feature = feature2subfeature_[feature_idx];
+        feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
       }
     }
   }
+  inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
+    feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
+  }
+  inline int RealFeatureIndex(int fidx) const {
+    return real_feature_idx_[fidx];
+  }
-  inline int GetInnerFeatureIndex(int col_idx) const {
+  inline int InnerFeatureIndex(int col_idx) const {
     return used_feature_map_[col_idx];
   }
+  inline int Feature2Group(int feature_idx) const {
+    return feature2group_[feature_idx];
+  }
+  inline int Feture2SubFeature(int feature_idx) const {
+    return feature2subfeature_[feature_idx];
+  }
+  inline uint64_t NumTotalBin() const {
+    return group_bin_boundaries_.back();
+  }
+  Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
+  void ReSize(data_size_t num_data);
+  void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
-  void FinishLoad();
-  bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
-  bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
-  bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
-  bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
-  bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
-  bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
+  LIGHTGBM_EXPORT void FinishLoad();
+  LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
+  LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
+  LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
+  LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
+  LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
+  LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
   /*!
   * \brief Save current dataset into binary file, will save to "filename.bin"
   */
-  void SaveBinaryFile(const char* bin_filename);
-  void CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse);
-  /*!
-  * \brief Get a feature pointer for specific index
-  * \param i Index for feature
-  * \return Pointer of feature
-  */
-  inline Feature* FeatureAt(int i) const { return features_[i].get(); }
+  LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
+  LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
+  LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
+  void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
+                           const data_size_t* data_indices, data_size_t num_data, int leaf_idx,
+                           std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
+                           const score_t* gradients, const score_t* hessians,
+                           score_t* ordered_gradients, score_t* ordered_hessians,
+                           HistogramBinEntry* histogram_data) const;
+  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian,
+                    data_size_t num_data, HistogramBinEntry* data) const;
+  inline data_size_t Split(int feature, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
+                           data_size_t* lte_indices, data_size_t* gt_indices) const {
+    const int group = feature2group_[feature];
+    const int sub_feature = feature2subfeature_[feature];
+    return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
+  }
+  inline int SubFeatureBinOffset(int i) const {
+    const int sub_feature = feature2subfeature_[i];
+    if (sub_feature == 0) { return 1; } else { return 0; }
+  }
+  inline int FeatureNumBin(int i) const {
+    const int group = feature2group_[i];
+    const int sub_feature = feature2subfeature_[i];
+    return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
+  }
+  inline const BinMapper* FeatureBinMapper(int i) const {
+    const int group = feature2group_[i];
+    const int sub_feature = feature2subfeature_[i];
+    return feature_groups_[group]->bin_mappers_[sub_feature].get();
+  }
+  inline BinIterator* FeatureIterator(int i) const {
+    const int group = feature2group_[i];
+    const int sub_feature = feature2subfeature_[i];
+    return feature_groups_[group]->SubFeatureIterator(sub_feature);
+  }
+  inline double RealThreshold(int i, uint32_t threshold) const {
+    const int group = feature2group_[i];
+    const int sub_feature = feature2subfeature_[i];
+    return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
+  }
+  inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
+    ordered_bins->resize(num_groups_);
+    #pragma omp parallel for schedule(guided)
+    for (int i = 0; i < num_groups_; ++i) {
+      ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
+    }
+  }
   /*!
   * \brief Get meta data pointer

@@ -384,6 +473,20 @@ public:
     feature_names_ = std::vector<std::string>(feature_names);
   }
+  inline std::vector<std::string> feature_infos() const {
+    std::vector<std::string> bufs;
+    for (int i = 0; i < num_total_features_; i++) {
+      int fidx = used_feature_map_[i];
+      if (fidx == -1) {
+        bufs.push_back("none");
+      } else {
+        const auto bin_mapper = FeatureBinMapper(fidx);
+        bufs.push_back(bin_mapper->bin_info());
+      }
+    }
+    return bufs;
+  }
   /*! \brief Get Number of data */
   inline data_size_t num_data() const { return num_data_; }

@@ -395,7 +498,7 @@ private:
   const char* data_filename_;
   /*! \brief Store used features */
-  std::vector<std::unique_ptr<Feature>> features_;
+  std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
   /*! \brief Mapper from real feature index to used index*/
   std::vector<int> used_feature_map_;
   /*! \brief Number of used features*/

@@ -412,6 +515,14 @@ private:
   std::vector<std::string> feature_names_;
   /*! \brief store feature names */
   static const char* binary_file_token;
+  int num_groups_;
+  std::vector<int> real_feature_idx_;
+  std::vector<int> feature2group_;
+  std::vector<int> feature2subfeature_;
+  std::vector<uint64_t> group_bin_boundaries_;
+  std::vector<int> group_feature_start_;
+  std::vector<int> group_feature_cnt_;
+  bool is_finish_load_;
 };
 }  // namespace LightGBM
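The core of this hunk is an extra level of indirection: a raw column index is first mapped to an inner feature index through used_feature_map_, and then to a (group, sub-feature) pair through feature2group_ and feature2subfeature_ before any bin data is touched. Below is a minimal C++ sketch of that lookup path under simplified assumptions; ToyDataset and PushOneValue are hypothetical names, not LightGBM API.

#include <cstdio>
#include <vector>

// Sketch only: mimics the column -> inner feature -> (group, sub-feature) mapping.
struct ToyDataset {
  std::vector<int> used_feature_map_;    // raw column -> inner feature index (-1 = unused)
  std::vector<int> feature2group_;       // inner feature -> group index
  std::vector<int> feature2subfeature_;  // inner feature -> position inside the group

  void PushOneValue(int col_idx, double value) {
    int feature_idx = used_feature_map_[col_idx];
    if (feature_idx < 0) return;                      // column dropped during preprocessing
    int group = feature2group_[feature_idx];
    int sub_feature = feature2subfeature_[feature_idx];
    std::printf("col %d -> group %d, sub-feature %d, value %f\n",
                col_idx, group, sub_feature, value);
  }
};

int main() {
  ToyDataset d;
  d.used_feature_map_ = {0, -1, 1};   // column 1 is unused
  d.feature2group_ = {0, 0};          // both used features live in group 0
  d.feature2subfeature_ = {0, 1};
  d.PushOneValue(2, 3.5);             // prints: col 2 -> group 0, sub-feature 1, value 3.5
  return 0;
}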
include/LightGBM/dataset_loader.h  (+10 -9)

@@ -8,21 +8,21 @@ namespace LightGBM {
 class DatasetLoader {
 public:
-  DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
-  ~DatasetLoader();
-  Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
-  Dataset* LoadFromFile(const char* filename) {
+  LIGHTGBM_EXPORT DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
+  LIGHTGBM_EXPORT ~DatasetLoader();
+  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
+  LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
     return LoadFromFile(filename, 0, 1);
   }
-  Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
-  Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines);
-  Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
-                                  size_t total_sample_size, data_size_t num_data);
+  LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
+  LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
+                                                  std::vector<std::vector<int>>& sample_indices,
+                                                  size_t total_sample_size, data_size_t num_data);
   /*! \brief Disable copy */
   DatasetLoader& operator=(const DatasetLoader&) = delete;

@@ -31,6 +31,8 @@ public:
 private:
+  Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines,
+                           int* num_global_data, std::vector<data_size_t>* used_data_indices);
   void SetHeader(const char* filename);
   void CheckDataset(const Dataset* dataset);

@@ -71,7 +73,6 @@ private:
   std::vector<std::string> feature_names_;
   /*! \brief Mapper from real feature index to used index*/
   std::unordered_set<int> categorical_features_;
 };
 }
include/LightGBM/export.h  (new file, 0 → 100644)

#ifndef LIGHTGBM_EXPORT_H_
#define LIGHTGBM_EXPORT_H_

/** Macros for exporting symbols in MSVC/GCC/CLANG **/

#ifdef __cplusplus
#define LIGHTGBM_EXTERN_C extern "C"
#else
#define LIGHTGBM_EXTERN_C
#endif

#ifdef _MSC_VER
#define LIGHTGBM_EXPORT __declspec(dllexport)
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
#else
#define LIGHTGBM_EXPORT
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
#endif

#endif /** LIGHTGBM_EXPORT_H_ **/
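This header is what the many LIGHTGBM_EXPORT annotations added throughout this commit refer to: on MSVC the macro marks a symbol for DLL export, and on other compilers it expands to nothing. A short illustrative sketch of how such a macro is typically applied to declarations follows; MyClass and MyCApiFunction are hypothetical names used only for the example.

#include <LightGBM/export.h>

// Sketch: decorate the declarations you want visible from the shared library.
class MyClass {
 public:
  LIGHTGBM_EXPORT MyClass();            // exported with __declspec(dllexport) on MSVC
  LIGHTGBM_EXPORT void DoSomething();   // no-op decoration on GCC/Clang
};

// C API entry points would use the extern "C" variant instead:
LIGHTGBM_C_EXPORT int MyCApiFunction(int value);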
include/LightGBM/feature.h → include/LightGBM/feature_group.h  (+190 -0)

-#ifndef LIGHTGBM_FEATURE_H_
-#define LIGHTGBM_FEATURE_H_
+#ifndef LIGHTGBM_FEATURE_GROUP_H_
+#define LIGHTGBM_FEATURE_GROUP_H_
 #include <LightGBM/utils/random.h>

@@ -12,22 +12,41 @@ namespace LightGBM {
-/*! \brief Using to store data and providing some operations on one feature*/
-class Feature {
+class Dataset;
+class DatasetLoader;
+/*! \brief Using to store data and providing some operations on one feature group*/
+class FeatureGroup {
 public:
+  friend Dataset;
+  friend DatasetLoader;
   /*!
   * \brief Constructor
-  * \param feature_idx Index of this feature
-  * \param bin_mapper Bin mapper for this feature
+  * \param num_feature number of features of this group
+  * \param bin_mappers Bin mapper for features
   * \param num_data Total number of data
   * \param is_enable_sparse True if enable sparse feature
   */
-  Feature(int feature_idx, BinMapper* bin_mapper, data_size_t num_data, bool is_enable_sparse)
-    : bin_mapper_(bin_mapper) {
-    feature_index_ = feature_idx;
-    bin_data_.reset(Bin::CreateBin(num_data, bin_mapper_->num_bin(),
-      bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_,
-      bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
+  FeatureGroup(int num_feature, std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+               data_size_t num_data, bool is_enable_sparse)
+    : num_feature_(num_feature) {
+    CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
+    // use bin at zero to store default_bin
+    num_total_bin_ = 1;
+    bin_offsets_.emplace_back(num_total_bin_);
+    int cnt_non_zero = 0;
+    for (int i = 0; i < num_feature_; ++i) {
+      bin_mappers_.emplace_back(bin_mappers[i].release());
+      auto num_bin = bin_mappers_[i]->num_bin();
+      if (bin_mappers_[i]->GetDefaultBin() == 0) {
+        num_bin -= 1;
+      }
+      num_total_bin_ += num_bin;
+      bin_offsets_.emplace_back(num_total_bin_);
+      cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
+    }
+    double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
+    bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_, sparse_rate, is_enable_sparse, &is_sparse_));
   }

@@ -35,39 +54,44 @@ public:
   * \param num_all_data Number of global data
   * \param local_used_indices Local used indices, empty means using all data
   */
-  Feature(const void* memory, data_size_t num_all_data, const std::vector<data_size_t>& local_used_indices) {
+  FeatureGroup(const void* memory, data_size_t num_all_data, const std::vector<data_size_t>& local_used_indices) {
     const char* memory_ptr = reinterpret_cast<const char*>(memory);
-    // get featuer index
-    feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
-    memory_ptr += sizeof(feature_index_);
     // get is_sparse
     is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
     memory_ptr += sizeof(is_sparse_);
+    num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
+    memory_ptr += sizeof(num_feature_);
     // get bin mapper
-    bin_mapper_.reset(new BinMapper(memory_ptr));
-    memory_ptr += bin_mapper_->SizesInByte();
+    bin_mappers_.clear();
+    bin_offsets_.clear();
+    // start from 1, due to need to store zero bin in this slot
+    num_total_bin_ = 1;
+    bin_offsets_.emplace_back(num_total_bin_);
+    for (int i = 0; i < num_feature_; ++i) {
+      bin_mappers_.emplace_back(new BinMapper(memory_ptr));
+      auto num_bin = bin_mappers_[i]->num_bin();
+      if (bin_mappers_[i]->GetDefaultBin() == 0) {
+        num_bin -= 1;
+      }
+      num_total_bin_ += num_bin;
+      bin_offsets_.emplace_back(num_total_bin_);
+      memory_ptr += bin_mappers_[i]->SizesInByte();
+    }
     data_size_t num_data = num_all_data;
     if (!local_used_indices.empty()) {
       num_data = static_cast<data_size_t>(local_used_indices.size());
     }
     if (is_sparse_) {
-      bin_data_.reset(Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
+      bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
     } else {
-      bin_data_.reset(Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
+      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
     }
     // get bin data
     bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
   }
   /*! \brief Destructor */
-  ~Feature() {
-  }
-  bool CheckAlign(const Feature& other) const {
-    if (feature_index_ != other.feature_index_) {
-      return false;
-    }
-    return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
-  }
+  ~FeatureGroup() {
+  }

@@ -76,66 +100,91 @@ public:
   * \param idx Index of record
   * \param value feature value of record
   */
-  inline void PushData(int tid, data_size_t line_idx, double value) {
-    unsigned int bin = bin_mapper_->ValueToBin(value);
-    bin_data_->Push(tid, line_idx, bin);
-  }
-  inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
-    bin_data_->Push(tid, line_idx, bin);
-  }
-  inline void FinishLoad() { bin_data_->FinishLoad(); }
-  /*! \brief Index of this feature */
-  inline int feature_index() const { return feature_index_; }
-  /*! \brief Bin mapper that this feature used */
-  inline const BinMapper* bin_mapper() const { return bin_mapper_.get(); }
-  /*! \brief Number of bin of this feature */
-  inline int num_bin() const { return bin_mapper_->num_bin(); }
-  inline BinType bin_type() const { return bin_mapper_->bin_type(); }
-  /*! \brief Get bin data of this feature */
-  inline const Bin* bin_data() const { return bin_data_.get(); }
+  inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
+    uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
+    if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) {
+      return;
+    }
+    bin += bin_offsets_[sub_feature_idx];
+    if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
+      bin -= 1;
+    }
+    bin_data_->Push(tid, line_idx, bin);
+  }
+  inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
+    bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
+  }
+  inline BinIterator* SubFeatureIterator(int sub_feature) {
+    uint32_t min_bin = bin_offsets_[sub_feature];
+    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
+    return bin_data_->GetIterator(min_bin, max_bin, default_bin);
+  }
+  inline data_size_t Split(int sub_feature, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
+                           data_size_t* lte_indices, data_size_t* gt_indices) const {
+    uint32_t min_bin = bin_offsets_[sub_feature];
+    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+    uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
+    return bin_data_->Split(min_bin, max_bin, default_bin, threshold, data_indices, num_data,
+                            lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
+  }
   /*!
   * \brief From bin to feature value
   * \param bin
-  * \return Feature value of this bin
+  * \return FeatureGroup value of this bin
   */
-  inline double BinToValue(unsigned int bin) const {
-    return bin_mapper_->BinToValue(bin);
+  inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
+    return bin_mappers_[sub_feature_idx]->BinToValue(bin);
   }
   /*!
   * \brief Save binary data to file
   * \param file File want to write
   */
   void SaveBinaryToFile(FILE* file) const {
-    fwrite(&feature_index_, sizeof(feature_index_), 1, file);
     fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
-    bin_mapper_->SaveBinaryToFile(file);
+    fwrite(&num_feature_, sizeof(num_feature_), 1, file);
+    for (int i = 0; i < num_feature_; ++i) {
+      bin_mappers_[i]->SaveBinaryToFile(file);
+    }
     bin_data_->SaveBinaryToFile(file);
   }
   /*!
   * \brief Get sizes in byte of this object
   */
   size_t SizesInByte() const {
-    return sizeof(feature_index_) + sizeof(is_sparse_) +
-      bin_mapper_->SizesInByte() + bin_data_->SizesInByte();
+    size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
+    for (int i = 0; i < num_feature_; ++i) {
+      ret += bin_mappers_[i]->SizesInByte();
+    }
+    ret += bin_data_->SizesInByte();
+    return ret;
   }
   /*! \brief Disable copy */
-  Feature& operator=(const Feature&) = delete;
+  FeatureGroup& operator=(const FeatureGroup&) = delete;
   /*! \brief Disable copy */
-  Feature(const Feature&) = delete;
+  FeatureGroup(const FeatureGroup&) = delete;
 private:
-  /*! \brief Index of this feature */
-  int feature_index_;
-  /*! \brief Bin mapper that this feature used */
-  std::unique_ptr<BinMapper> bin_mapper_;
+  /*! \brief Number of features */
+  int num_feature_;
+  /*! \brief Bin mapper for sub features */
+  std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
+  /*! \brief Bin offsets for sub features */
+  std::vector<uint32_t> bin_offsets_;
   /*! \brief Bin data of this feature */
   std::unique_ptr<Bin> bin_data_;
   /*! \brief True if this feature is sparse */
   bool is_sparse_;
+  int num_total_bin_;
 };
 }  // namespace LightGBM
-#endif   // LightGBM_FEATURE_H_
+#endif   // LIGHTGBM_FEATURE_GROUP_H_
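The key idea of FeatureGroup is that the bins of all sub-features share one bin space: slot 0 holds the common "default" bin, each sub-feature gets a contiguous range starting at bin_offsets_[i], and a sub-feature whose own default bin is 0 contributes one slot less. Here is a small self-contained C++ sketch of that offset computation under assumed toy numbers (it is an illustration, not the FeatureGroup constructor itself).

#include <cstdio>
#include <vector>

int main() {
  // Toy per-sub-feature bin counts and default bins.
  std::vector<int> num_bin     = {4, 3, 5};
  std::vector<int> default_bin = {0, 1, 0};

  std::vector<int> bin_offsets;
  int num_total_bin = 1;               // slot 0 stores the shared default bin
  bin_offsets.push_back(num_total_bin);
  for (size_t i = 0; i < num_bin.size(); ++i) {
    int nb = num_bin[i];
    if (default_bin[i] == 0) nb -= 1;  // default bin folded into slot 0
    num_total_bin += nb;
    bin_offsets.push_back(num_total_bin);
  }
  for (size_t i = 0; i < num_bin.size(); ++i) {
    // Same half-open range [min_bin, max_bin + 1) that SubFeatureIterator/Split use.
    std::printf("sub-feature %zu occupies bins [%d, %d)\n",
                i, bin_offsets[i], bin_offsets[i + 1]);
  }
  std::printf("total bins in this group: %d\n", num_total_bin);
  return 0;
}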
include/LightGBM/metric.h  (+1 -1)

@@ -46,7 +46,7 @@ public:
   * \param type Specific type of metric
   * \param config Config for metric
   */
-  static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
+  LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
 };
include/LightGBM/network.h  (+1 -1)

@@ -41,7 +41,7 @@ public:
 * When number of machines is not power of 2, need group machines into power of 2 group.
 * And we can let each group has at most 2 machines.
 * if the group only has 1 machine. this machine is the normal node
-* if the grou has 2 machines, this group will have two type of nodes, one is the leader.
+* if the group has 2 machines, this group will have two type of nodes, one is the leader.
 * leader will represent this group and communication with others.
 */
 enum RecursiveHalvingNodeType {
include/LightGBM/objective_function.h  (+1 -1)

@@ -44,7 +44,7 @@ public:
   * \param type Specific type of objective function
   * \param config Config for objective function
   */
-  static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
-                                                    const ObjectiveConfig& config);
+  LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
+                                                                    const ObjectiveConfig& config);
 };
include/LightGBM/tree.h  (+27 -38)

@@ -2,7 +2,6 @@
 #define LIGHTGBM_TREE_H_
 #include <LightGBM/meta.h>
-#include <LightGBM/feature.h>
 #include <LightGBM/dataset.h>
 #include <string>

@@ -46,7 +45,7 @@ public:
   * \param gain Split gain
   * \return The index of new leaf.
   */
-  int Split(int leaf, int feature, BinType bin_type, unsigned int threshold, int real_feature,
+  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
             double threshold_double, double left_value,
             double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);

@@ -64,7 +63,8 @@ public:
   * \param num_data Number of total data
   * \param score Will add prediction to score
   */
   void AddPredictionToScore(const Dataset* data, data_size_t num_data,
                             double* score) const;
   /*!

@@ -93,7 +93,7 @@ public:
   inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
   /*! \brief Get feature of specific split*/
-  inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; }
+  inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
   /*!
   * \brief Shrinkage for the tree's output

@@ -101,9 +101,11 @@ public:
   * \param rate The factor of shrinkage
   */
   inline void Shrinkage(double rate) {
+    #pragma omp parallel for schedule(static)
     for (int i = 0; i < num_leaves_; ++i) {
-      leaf_value_[i] = leaf_value_[i] * rate;
+      leaf_value_[i] *= rate;
     }
+    shrinkage_ *= rate;
   }
   /*! \brief Serialize this object to string*/

@@ -138,18 +140,10 @@ public:
     }
   }
-  static std::vector<std::function<bool(unsigned int, unsigned int)>> inner_decision_funs;
-  static std::vector<std::function<bool(double, double)>> decision_funs;
+  static std::vector<bool (*)(uint32_t, uint32_t)> inner_decision_funs;
+  static std::vector<bool (*)(double, double)> decision_funs;
 private:
-  /*!
-  * \brief Find leaf index of which record belongs by data
-  * \param data The dataset
-  * \param data_idx Index of record
-  * \return Leaf index
-  */
-  inline int GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators, data_size_t data_idx) const;
   /*!
   * \brief Find leaf index of which record belongs by features

@@ -171,11 +165,11 @@ private:
   /*! \brief A non-leaf node's right child */
   std::vector<int> right_child_;
   /*! \brief A non-leaf node's split feature */
-  std::vector<int> split_feature_;
+  std::vector<int> split_feature_inner;
   /*! \brief A non-leaf node's split feature, the original index */
-  std::vector<int> split_feature_real_;
+  std::vector<int> split_feature_;
   /*! \brief A non-leaf node's split threshold in bin */
-  std::vector<unsigned int> threshold_in_bin_;
+  std::vector<uint32_t> threshold_in_bin_;
   /*! \brief A non-leaf node's split threshold in feature value */
   std::vector<double> threshold_;
   /*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */

@@ -195,39 +189,34 @@ private:
   std::vector<data_size_t> internal_count_;
   /*! \brief Depth for leaves */
   std::vector<int> leaf_depth_;
+  double shrinkage_;
+  bool has_categorical_;
 };

 inline double Tree::Predict(const double* feature_values) const {
+  if (num_leaves_ > 1) {
     int leaf = GetLeaf(feature_values);
     return LeafOutput(leaf);
+  } else {
+    return 0.0f;
+  }
 }

 inline int Tree::PredictLeafIndex(const double* feature_values) const {
+  if (num_leaves_ > 1) {
     int leaf = GetLeaf(feature_values);
     return leaf;
+  } else {
+    return 0;
+  }
 }

-inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators, data_size_t data_idx) const {
-  int node = 0;
-  while (node >= 0) {
-    if (inner_decision_funs[decision_type_[node]](
-        iterators[split_feature_[node]]->Get(data_idx),
-        threshold_in_bin_[node])) {
-      node = left_child_[node];
-    } else {
-      node = right_child_[node];
-    }
-  }
-  return ~node;
-}

 inline int Tree::GetLeaf(const double* feature_values) const {
   int node = 0;
   while (node >= 0) {
     if (decision_funs[decision_type_[node]](
-        feature_values[split_feature_real_[node]],
+        feature_values[split_feature_[node]],
         threshold_[node])) {
       node = left_child_[node];
     } else {
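The traversal in Tree::GetLeaf above relies on two conventions: internal nodes are indexed >= 0 with leaves stored as bitwise complements (~leaf) in the child arrays, and the per-node comparison is picked from a small table indexed by the node's decision type. The following stand-alone C++ sketch illustrates that pattern with a one-split toy tree; it is not the LightGBM Tree class, and the two decision functions are simplified stand-ins.

#include <cstdio>
#include <vector>

static bool NumericalDecision(double value, double threshold) { return value <= threshold; }
static bool CategoricalDecision(double value, double threshold) {
  return static_cast<int>(value) == static_cast<int>(threshold);
}

int main() {
  // Table of decision functions, indexed by decision type (0 = numerical, 1 = categorical).
  bool (*decision_funs[2])(double, double) = {NumericalDecision, CategoricalDecision};

  // A tiny tree: node 0 splits feature 0 at 0.5; its children are leaves 0 and 1, stored as ~0 and ~1.
  std::vector<int> split_feature = {0};
  std::vector<double> threshold = {0.5};
  std::vector<int> decision_type = {0};
  std::vector<int> left_child = {~0};
  std::vector<int> right_child = {~1};

  double feature_values[] = {0.7};
  int node = 0;
  while (node >= 0) {
    if (decision_funs[decision_type[node]](feature_values[split_feature[node]], threshold[node])) {
      node = left_child[node];
    } else {
      node = right_child[node];
    }
  }
  std::printf("record falls into leaf %d\n", ~node);  // 0.7 > 0.5, so it prints leaf 1
  return 0;
}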
include/LightGBM/tree_learner.h  (+2 -0)

@@ -27,6 +27,8 @@ public:
   */
   virtual void Init(const Dataset* train_data) = 0;
+  virtual void ResetTrainingData(const Dataset* train_data) = 0;
   /*!
   * \brief Reset tree configs
   * \param tree_config config of tree
include/LightGBM/utils/array_args.h  (+90 -41)

@@ -3,6 +3,7 @@
 #include <vector>
 #include <algorithm>
+#include <LightGBM/utils/openmp_wrapper.h>

 namespace LightGBM {

@@ -12,88 +13,136 @@ namespace LightGBM {
 template<typename VAL_T>
 class ArrayArgs {
 public:
+  inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
+    int num_threads = 1;
+    #pragma omp parallel
+    #pragma omp master
+    {
+      num_threads = omp_get_num_threads();
+    }
+    int step = std::max(1, (static_cast<int>(array.size()) + num_threads - 1) / num_threads);
+    std::vector<size_t> arg_maxs(num_threads, 0);
+    #pragma omp parallel for schedule(static,1)
+    for (int i = 0; i < num_threads; ++i) {
+      size_t start = step * i;
+      if (start >= array.size()) { continue; }
+      size_t end = std::min(array.size(), start + step);
+      size_t arg_max = start;
+      for (size_t j = start + 1; j < end; ++j) {
+        if (array[j] > array[arg_max]) {
+          arg_max = j;
+        }
+      }
+      arg_maxs[i] = arg_max;
+    }
+    size_t ret = arg_maxs[0];
+    for (int i = 1; i < num_threads; ++i) {
+      if (array[arg_maxs[i]] > array[ret]) {
+        ret = arg_maxs[i];
+      }
+    }
+    return ret;
+  }

   inline static size_t ArgMax(const std::vector<VAL_T>& array) {
     if (array.empty()) {
       return 0;
     }
-    size_t argMax = 0;
-    for (size_t i = 1; i < array.size(); ++i) {
-      if (array[i] > array[argMax]) {
-        argMax = i;
+    if (array.size() > 100) {
+      return ArgMaxMT(array);
+    } else {
+      size_t arg_max = 0;
+      for (size_t i = 1; i < array.size(); ++i) {
+        if (array[i] > array[arg_max]) {
+          arg_max = i;
+        }
       }
+      return arg_max;
     }
-    return argMax;
   }

   inline static size_t ArgMin(const std::vector<VAL_T>& array) {
     if (array.empty()) {
       return 0;
     }
-    size_t argMin = 0;
+    size_t arg_min = 0;
     for (size_t i = 1; i < array.size(); ++i) {
-      if (array[i] < array[argMin]) {
-        argMin = i;
+      if (array[i] < array[arg_min]) {
+        arg_min = i;
       }
     }
-    return argMin;
+    return arg_min;
   }

   inline static size_t ArgMax(const VAL_T* array, size_t n) {
     if (n <= 0) {
       return 0;
     }
-    size_t argMax = 0;
+    size_t arg_max = 0;
     for (size_t i = 1; i < n; ++i) {
-      if (array[i] > array[argMax]) {
-        argMax = i;
+      if (array[i] > array[arg_max]) {
+        arg_max = i;
       }
     }
-    return argMax;
+    return arg_max;
   }

   inline static size_t ArgMin(const VAL_T* array, size_t n) {
     if (n <= 0) {
       return 0;
     }
-    size_t argMin = 0;
+    size_t arg_min = 0;
     for (size_t i = 1; i < n; ++i) {
-      if (array[i] < array[argMin]) {
-        argMin = i;
+      if (array[i] < array[arg_min]) {
+        arg_min = i;
       }
     }
-    return argMin;
+    return arg_min;
   }

-  inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) {
-    VAL_T& pivot = (*array)[end - 1];
-    size_t p_idx = start;
-    for (size_t i = start; i < end - 1; ++i) {
-      if ((*array)[i] > pivot) {
-        std::swap((*array)[p_idx], (*array)[i]);
-        ++p_idx;
-      }
-    }
-    std::swap((*array)[p_idx], (*array)[end - 1]);
-    return p_idx;
-  }
+  inline static void Partition(std::vector<VAL_T>* arr, int start, int end, int* l, int* r) {
+    int i = start - 1;
+    int j = end - 1;
+    int p = i;
+    int q = j;
+    if (start >= end) {
+      return;
+    }
+    std::vector<VAL_T>& ref = *arr;
+    VAL_T v = ref[end - 1];
+    for (;;) {
+      while (ref[++i] > v);
+      while (v > ref[--j]) {
+        if (j == start) {
+          break;
+        }
+      }
+      if (i >= j) {
+        break;
+      }
+      std::swap(ref[i], ref[j]);
+      if (ref[i] == v) {
+        p++;
+        std::swap(ref[p], ref[i]);
+      }
+      if (v == ref[j]) {
+        q--;
+        std::swap(ref[j], ref[q]);
+      }
+    }
+    std::swap(ref[i], ref[end - 1]);
+    j = i - 1;
+    i = i + 1;
+    for (int k = start; k <= p; k++, j--) {
+      std::swap(ref[k], ref[j]);
+    }
+    for (int k = end - 2; k >= q; k--, i++) {
+      std::swap(ref[i], ref[k]);
+    }
+    *l = j;
+    *r = i;
+  };

-  inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) {
-    if (start == end - 1) {
+  inline static int ArgMaxAtK(std::vector<VAL_T>* arr, int start, int end, int k) {
+    if (start >= end - 1) {
       return start;
     }
-    size_t p_idx = Partition(array, start, end);
-    if (p_idx == k) {
-      return p_idx;
-    } else if (k < p_idx) {
-      return ArgMaxAtK(array, start, p_idx, k);
-    } else {
-      return ArgMaxAtK(array, p_idx + 1, end, k);
-    }
+    int l = start;
+    int r = end - 1;
+    Partition(arr, start, end, &l, &r);
+    if ((k > l && k < r) || l == 0 || r == end - 1) {
+      return k;
+    } else if (k <= l) {
+      return ArgMaxAtK(arr, start, l, k);
+    } else {
+      return ArgMaxAtK(arr, r, end, k);
+    }
   }

-  inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) {
+  inline static void MaxK(const std::vector<VAL_T>& array, int k, std::vector<VAL_T>* out) {
     out->clear();
     if (k <= 0) {
       return;

@@ -101,10 +150,10 @@ public:
     for (auto val : array) {
       out->push_back(val);
     }
-    if (k >= array.size()) {
+    if (static_cast<size_t>(k) >= array.size()) {
       return;
     }
-    ArgMaxAtK(out, 0, out->size(), k - 1);
+    ArgMaxAtK(out, 0, static_cast<int>(out->size()), k - 1);
     out->erase(out->begin() + k, out->end());
   }
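MaxK uses the quickselect-style ArgMaxAtK above to move the k largest values to the front of the buffer and then truncates it. A rough equivalent of that result, written here as a sketch against the standard library's selection algorithm rather than the hand-written three-way partition, looks like this (toy data, not LightGBM code):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  std::vector<double> array = {0.3, 2.5, 1.1, 0.9, 4.2, 1.7};
  int k = 3;

  std::vector<double> out(array);
  // Partition so the k largest values come first (their internal order is unspecified).
  std::nth_element(out.begin(), out.begin() + (k - 1), out.end(), std::greater<double>());
  out.erase(out.begin() + k, out.end());

  for (double v : out) std::printf("%f\n", v);  // 4.2, 2.5, 1.7 in some order
  return 0;
}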
include/LightGBM/utils/common.h  (+4 -1)

@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
     frac = 0;
     scale = 1.0;
     if ((*p == 'e') || (*p == 'E')) {
-      unsigned int expon;
+      uint32_t expon;
       // Get sign of exponent, if any.
       ++p;
       if (*p == '-') {

@@ -273,6 +273,9 @@ inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, cha
 template<typename T>
 inline static std::vector<T> StringToArray(const std::string& str, char delimiter, size_t n) {
+  if (n == 0) {
+    return std::vector<T>();
+  }
   std::vector<std::string> strs = Split(str.c_str(), delimiter);
   if (strs.size() != n) {
     Log::Fatal("StringToArray error, size doesn't match.");
include/LightGBM/utils/log.h  (+13 -1)

@@ -45,6 +45,10 @@ public:
     GetLevel() = level;
   }
+  static void ResetUseException(bool use_ex) {
+    UseException() = use_ex;
+  }
   static void Debug(const char* format, ...) {
     va_list val;
     va_start(val, format);

@@ -73,7 +77,13 @@ public:
     vsprintf(str_buf, format, val);
 #endif
     va_end(val);
+    fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf);
+    fflush(stderr);
+    if (UseException()) {
       throw std::runtime_error(std::string(str_buf));
+    } else {
+      std::exit(-1);
+    }
   }
 private:

@@ -96,6 +106,8 @@ private:
   static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
 #endif
+  static bool& UseException() { static bool use_ex = false; return use_ex; }
 };
 }  // namespace LightGBM
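The new ResetUseException/UseException pair lets a caller decide whether a fatal log should throw or terminate the process. The mechanism is a function-local static returned by reference, which acts as a lazily initialized global flag. A small stand-alone C++ sketch of the same pattern (not the Log class itself, names simplified):

#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

static bool& UseException() {
  static bool use_ex = false;  // default: terminate the process
  return use_ex;
}

static void Fatal(const std::string& msg) {
  std::fprintf(stderr, "[Fatal] %s\n", msg.c_str());
  std::fflush(stderr);
  if (UseException()) {
    throw std::runtime_error(msg);   // let a host application recover
  } else {
    std::exit(-1);                   // plain CLI behaviour
  }
}

int main() {
  UseException() = true;  // e.g. a language binding that must not kill the host process
  try {
    Fatal("something went wrong");
  } catch (const std::runtime_error& e) {
    std::printf("caught: %s\n", e.what());
  }
  return 0;
}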
include/LightGBM/utils/openmp_wrapper.h  (new file, 0 → 100644)

#ifndef LIGHTGBM_OPENMP_WRAPPER_H_
#define LIGHTGBM_OPENMP_WRAPPER_H_

#ifdef _OPENMP
#include <omp.h>
#else

#ifdef _MSC_VER
  #pragma warning(disable : 4068)  // disable unknown pragma warning
#endif

#ifdef __cplusplus
extern "C" {
#endif
  /** Fall here if no OPENMP support, so just
      simulate a single thread running.
      All #pragma omp should be ignored by the compiler **/
  inline void omp_set_num_threads(int) {}
  inline int omp_get_num_threads() { return 1; }
  inline int omp_get_thread_num() { return 0; }
#ifdef __cplusplus
};  // extern "C"
#endif

#endif
#endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */
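The point of this wrapper is that code written against the OpenMP runtime API still compiles on toolchains without OpenMP: the pragmas are ignored and the stub functions report a single thread. A hedged sketch of code that works either way (it inlines the fallback directly instead of including LightGBM's header, so it is self-contained):

#include <cstdio>

#ifdef _OPENMP
#include <omp.h>
#else
// Single-thread fallback, same idea as the wrapper above.
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
#endif

int main() {
  int sum = 0;
  // With OpenMP this loop is parallelized; without it, the pragma is ignored.
  #pragma omp parallel for reduction(+:sum)
  for (int i = 0; i < 1000; ++i) {
    sum += i;
  }
  std::printf("sum = %d, threads outside parallel region = %d\n", sum, omp_get_num_threads());
  return 0;
}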
include/LightGBM/utils/random.h  (+28 -19)

@@ -20,30 +20,41 @@ public:
     std::random_device rd;
     auto genrator = std::mt19937(rd());
     std::uniform_int_distribution<int> distribution(0, x);
-    x = static_cast<unsigned int>(distribution(genrator));
+    x = distribution(genrator);
   }
   /*!
   * \brief Constructor, with specific seed
   */
   Random(int seed) {
-    x = static_cast<unsigned int>(seed);
+    x = seed;
   }
   /*!
-  * \brief Generate random integer
+  * \brief Generate random integer, int16 range. [0, 65536]
+  * \param lower_bound lower bound
+  * \param upper_bound upper bound
+  * \return The random integer between [lower_bound, upper_bound)
+  */
+  inline int NextShort(int lower_bound, int upper_bound) {
+    return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
+  }
+  /*!
+  * \brief Generate random integer, int32 range
   * \param lower_bound lower bound
   * \param upper_bound upper bound
   * \return The random integer between [lower_bound, upper_bound)
   */
   inline int NextInt(int lower_bound, int upper_bound) {
-    return (next()) % (upper_bound - lower_bound + 1) + lower_bound;
+    return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
   }
   /*!
   * \brief Generate random float data
   * \return The random float between [0.0, 1.0)
   */
-  inline double NextDouble() {
+  inline float NextFloat() {
     // get random float in [0,1)
-    return static_cast<double>(next() % 2047) / 2047.0f;
+    return static_cast<float>(RandInt16()) / (32768.0f);
   }
   /*!
   * \brief Sample K data from {0,1,...,N-1}

@@ -58,26 +69,24 @@ public:
     }
     for (int i = 0; i < N; ++i) {
       double prob = (K - ret.size()) / static_cast<double>(N - i);
-      if (NextDouble() < prob) {
+      if (NextFloat() < prob) {
         ret.push_back(i);
       }
     }
     return ret;
   }
 private:
-  unsigned next() {
-    x ^= x << 16;
-    x ^= x >> 5;
-    x ^= x << 1;
-    auto t = x;
-    x = y;
-    y = z;
-    z = t ^ x ^ y;
-    return z;
-  }
-  unsigned int x = 123456789;
-  unsigned int y = 362436069;
-  unsigned int z = 521288629;
+  inline int RandInt16() {
+    x = (214013 * x + 2531011);
+    return (x >> 16) & 0x7FFF;
+  }
+  inline int RandInt32() {
+    x = (214013 * x + 2531011);
+    return x & 0x7FFFFFF;
+  }
+  int x = 123456789;
 };
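The change replaces the three-word xorshift generator with a linear congruential step, x = 214013*x + 2531011, taking the high bits for a 15-bit draw (RandInt16) and masked low bits for a wider draw (RandInt32). A self-contained C++ sketch of that scheme follows; ToyRandom is a hypothetical name, and it deliberately uses an unsigned state so the overflow in the multiply stays well defined in the sketch.

#include <cstdio>

struct ToyRandom {
  unsigned int x = 123456789;            // unsigned here to keep the overflow well defined
  int RandInt16() {
    x = 214013u * x + 2531011u;          // LCG step
    return static_cast<int>((x >> 16) & 0x7FFF);  // high bits are better mixed than the low ones
  }
  int NextShort(int lower_bound, int upper_bound) {
    return RandInt16() % (upper_bound - lower_bound) + lower_bound;
  }
};

int main() {
  ToyRandom rng;
  for (int i = 0; i < 5; ++i) {
    std::printf("%d ", rng.NextShort(0, 10));  // five draws in [0, 10)
  }
  std::printf("\n");
  return 0;
}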
include/LightGBM/utils/threading.h  (+1 -1)

 #ifndef LIGHTGBM_UTILS_THREADING_H_
 #define LIGHTGBM_UTILS_THREADING_H_
-#include <omp.h>
+#include <LightGBM/utils/openmp_wrapper.h>
 #include <vector>
 #include <functional>
python-package/lightgbm/__init__.py  (+11 -4)

@@ -6,13 +6,19 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
 from __future__ import absolute_import

-from .basic import Dataset, Booster
-from .engine import train, cv
-from .callback import (early_stopping, print_evaluation, record_evaluation,
-                       reset_parameter)
+from .basic import Booster, Dataset
+from .callback import (print_evaluation, record_evaluation,
+                       reset_parameter, early_stopping)
+from .engine import cv, train

 try:
     from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
 except ImportError:
     pass
+try:
+    from .plotting import plot_importance, plot_metric, plot_tree
+except ImportError:
+    pass

 __version__ = 0.1

@@ -20,4 +26,5 @@ __version__ = 0.1
 __all__ = ['Dataset', 'Booster',
            'train', 'cv',
            'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
-           'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping']
+           'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
+           'plot_importance', 'plot_metric', 'plot_tree']
python-package/lightgbm/basic.py  (+191 -59) — diff collapsed in this view.
python-package/lightgbm/callback.py  (+28 -25)

@@ -3,6 +3,7 @@
 from __future__ import absolute_import

 import collections
+from operator import gt, lt

 from .compat import range_

@@ -159,48 +160,50 @@ def early_stopping(stopping_rounds, verbose=True):
     callback : function
         The requested callback function.
     """
-    factor_to_bigger_better = {}
-    best_score = {}
-    best_iter = {}
-    best_msg = {}
+    best_score = []
+    best_iter = []
+    best_msg = []
+    cmp_op = []

     def init(env):
         """internal function"""
         if not env.evaluation_result_list:
-            raise ValueError('For early stopping, at least one dataset or eval metric is required for evaluation')
+            raise ValueError('For early stopping, at least one dataset and eval metric is required for evaluation')
         if verbose:
             msg = "Train until valid scores didn't improve in {} rounds."
             print(msg.format(stopping_rounds))
-        for i in range_(len(env.evaluation_result_list)):
-            best_score[i] = float('-inf')
-            best_iter[i] = 0
-            if verbose:
-                best_msg[i] = ""
-            factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
+        for eval_ret in env.evaluation_result_list:
+            best_iter.append(0)
+            if verbose:
+                best_msg.append(None)
+            if eval_ret[3]:
+                best_score.append(float('-inf'))
+                cmp_op.append(gt)
+            else:
+                best_score.append(float('inf'))
+                cmp_op.append(lt)

     def callback(env):
         """internal function"""
-        if not best_score:
+        if not cmp_op:
             init(env)
+        best_msg_buffer = None
         for i in range_(len(env.evaluation_result_list)):
-            score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
-            if score > best_score[i]:
+            score = env.evaluation_result_list[i][2]
+            if cmp_op[i](score, best_score[i]):
                 best_score[i] = score
                 best_iter[i] = env.iteration
                 if verbose:
-                    best_msg[i] = '[%d]\t%s' % (
-                        env.iteration + 1, '\t'.join(
-                            [_format_eval_result(x) for x in env.evaluation_result_list]
-                        )
-                    )
-            elif env.iteration - best_iter[i] >= stopping_rounds:
-                env.model.set_attr(best_iteration=str(best_iter[i]))
-                if verbose:
-                    print('Early stopping, best iteration is:')
-                    print(best_msg[i])
-                raise EarlyStopException(best_iter[i])
+                    if not best_msg_buffer:
+                        best_msg_buffer = '[%d]\t%s' % (
+                            env.iteration + 1,
+                            '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
+                    best_msg[i] = best_msg_buffer
+            else:
+                if env.iteration - best_iter[i] >= stopping_rounds:
+                    env.model.set_attr(best_iteration=str(best_iter[i]))
+                    if verbose:
+                        print('Early stopping, best iteration is:\n' + best_msg[i])
+                    raise EarlyStopException(best_iter[i])
     callback.order = 30
     return callback
python-package/lightgbm/compat.py View file @ eade219e
...
@@ -6,13 +6,15 @@ from __future__ import absolute_import
 import inspect
 import sys

+import numpy as np
+
 is_py3 = (sys.version_info[0] == 3)

 """compatibility between python2 and python3"""
 if is_py3:
     string_type = str
     numeric_types = (int, float, bool)
-    integer_types = int
+    integer_types = (int, )
     range_ = range

     def argc_(func):
...
@@ -36,6 +38,16 @@ except (ImportError, SyntaxError):
     # because of u'...' Unicode literals.
     import json

+
+def json_default_with_numpy(obj):
+    if isinstance(obj, (np.integer, np.floating, np.bool_)):
+        return obj.item()
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    else:
+        return obj
+
+
 """pandas"""
 try:
     from pandas import Series, DataFrame
...
@@ -69,5 +81,4 @@ except ImportError:
     LGBMClassifierBase = object
     LGBMRegressorBase = object
     LGBMLabelEncoder = None
-    LGBMDeprecated = None
     LGBMStratifiedKFold = None
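Note: a minimal sketch of how the new json_default_with_numpy helper is meant to be used; the sample dict is illustrative. It converts NumPy scalars with .item() and arrays with .tolist() so that json.dumps can encode them.

    import json
    import numpy as np
    from lightgbm.compat import json_default_with_numpy

    info = {'num_leaves': np.int32(31), 'importance': np.array([1.5, 0.25])}
    # Without the default hook, json.dumps raises a TypeError on the NumPy
    # values; with it, they are converted to plain Python types first.
    print(json.dumps(info, default=json_default_with_numpy))
    # {"num_leaves": 31, "importance": [1.5, 0.25]}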
python-package/lightgbm/engine.py View file @ eade219e
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
 def train(params, train_set, num_boost_round=100,
           valid_sets=None, valid_names=None,
           fobj=None, feval=None, init_model=None,
-          feature_name=None, categorical_feature=None,
+          feature_name='auto', categorical_feature='auto',
           early_stopping_rounds=None, evals_result=None,
           verbose_eval=True, learning_rates=None, callbacks=None):
     """
...
@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
         Note: should return (eval_name, eval_result, is_higher_better) of list of this
     init_model : file name of lightgbm model or 'Booster' instance
         model used for continued train
-    feature_name : list of str
+    feature_name : list of str, or 'auto'
         Feature names
-    categorical_feature : list of str or int
+        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
         Categorical features,
         type int represents index,
         type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
     early_stopping_rounds: int
         Activates early stopping.
         Requires at least one validation data and one metric
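Note: a minimal sketch of what the new 'auto' defaults enable when the data is a pandas DataFrame; the frame, column names, and parameters are made up, and it assumes the Dataset constructor in this version accepts a DataFrame (as the pandas handling in basic.py suggests).

    import numpy as np
    import pandas as pd
    import lightgbm as lgb

    df = pd.DataFrame({
        'price': np.random.rand(100),
        'color': pd.Categorical(np.random.choice(['red', 'green', 'blue'], 100)),
    })
    label = np.random.randint(0, 2, 100)
    train_data = lgb.Dataset(df, label=label)

    # feature_name='auto' picks up the DataFrame column names, and
    # categorical_feature='auto' treats the pandas categorical column 'color'
    # as categorical without listing it explicitly.
    booster = lgb.train({'objective': 'binary'}, train_data, num_boost_round=10)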
...
@@ -96,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
     init_iteration = predictor.num_total_iteration if predictor is not None else 0
     """check dataset"""
     if not isinstance(train_set, Dataset):
-        raise TypeError("Traninig only accepts Dataset object")
+        raise TypeError("Training only accepts Dataset object")
     train_set._update_params(params)
     train_set._set_predictor(predictor)
...
@@ -219,28 +221,35 @@ class CVBooster(object):
         return handlerFunction

-def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
+def _make_n_folds(full_data, data_splitter, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
     """
     Make an n-fold list of Booster from random indices.
     """
     np.random.seed(seed)
-    if stratified:
-        if SKLEARN_INSTALLED:
-            sfk = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
-            idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
-        else:
-            raise LightGBMError('Scikit-learn is required for stratified cv')
-    else:
-        full_data.construct()
-        if shuffle:
-            randidx = np.random.permutation(full_data.num_data())
-            kstep = int(len(randidx) / nfold)
-            idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range_(nfold)]
+    num_data = full_data.construct().num_data()
+    if data_splitter is not None:
+        if not hasattr(data_splitter, 'split'):
+            raise AttributeError("data_splitter has no method 'split'")
+        folds = data_splitter.split(np.arange(num_data))
+    elif stratified:
+        if not SKLEARN_INSTALLED:
+            raise LightGBMError('Scikit-learn is required for stratified cv')
+        sfk = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
+        folds = sfk.split(X=np.zeros(num_data), y=full_data.get_label())
+    else:
+        if shuffle:
+            randidx = np.random.permutation(num_data)
+        else:
+            randidx = np.arange(num_data)
+        kstep = int(num_data / nfold)
+        test_id = [randidx[i: i + kstep] for i in range_(0, num_data, kstep)]
+        train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)]
+        folds = zip(train_id, test_id)

     ret = CVBooster()
-    for k in range_(nfold):
-        train_set = full_data.subset(np.concatenate([idset[i] for i in range_(nfold) if k != i]))
-        valid_set = full_data.subset(idset[k])
+    for train_idx, test_idx in folds:
+        train_set = full_data.subset(train_idx)
+        valid_set = full_data.subset(test_idx)
         # run preprocessing on the data set if needed
         if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
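Note: a standalone sketch of the index bookkeeping in the final (plain k-fold) branch above; num_data and nfold are made up, and the built-in range stands in for the compat range_.

    import numpy as np

    num_data, nfold = 10, 5
    randidx = np.random.permutation(num_data)
    kstep = int(num_data / nfold)
    # Consecutive slices of the permutation become the test folds ...
    test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
    # ... and each training fold is the concatenation of the other test folds.
    train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i])
                for k in range(nfold)]
    for train_idx, test_idx in zip(train_id, test_id):
        # Every row lands in exactly one test fold and in the train side of the rest.
        assert len(train_idx) + len(test_idx) == num_data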
...
@@ -265,9 +274,10 @@ def _agg_cv_result(raw_results):
     return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]

-def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
-       shuffle=True, metrics=None, fobj=None, feval=None, init_model=None,
-       feature_name=None, categorical_feature=None,
+def cv(params, train_set, num_boost_round=10,
+       data_splitter=None, nfold=5, stratified=False, shuffle=True,
+       metrics=None, fobj=None, feval=None, init_model=None,
+       feature_name='auto', categorical_feature='auto',
        early_stopping_rounds=None, fpreproc=None,
        verbose_eval=None, show_stdv=True, seed=0,
        callbacks=None):
...
@@ -282,14 +292,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
         Data to be trained.
     num_boost_round : int
         Number of boosting iterations.
+    data_splitter : an instance with split(X) method
+        Instance with split(X) method.
     nfold : int
         Number of folds in CV.
     stratified : bool
         Perform stratified sampling.
     shuffle: bool
         Whether shuffle before split data
-    folds : a KFold or StratifiedKFold instance
-        Sklearn KFolds or StratifiedKFolds.
     metrics : string or list of strings
         Evaluation metrics to be watched in CV.
     fobj : function
...
@@ -298,11 +308,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
         Custom evaluation function.
     init_model : file name of lightgbm model or 'Booster' instance
         model used for continued train
-    feature_name : list of str
+    feature_name : list of str, or 'auto'
         Feature names
-    categorical_feature : list of str or int
-        Categorical features, type int represents index,
+        If 'auto' and data is pandas DataFrame, use data columns name
+    categorical_feature : list of str or int, or 'auto'
+        Categorical features,
+        type int represents index,
         type str represents feature names (need to specify feature_name as well)
+        If 'auto' and data is pandas DataFrame, use pandas categorical columns
     early_stopping_rounds: int
         Activates early stopping. CV error needs to decrease at least
         every <early_stopping_rounds> round(s) to continue.
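Note: a minimal sketch of the new data_splitter hook using scikit-learn's KFold as the splitter; the data and parameter values are illustrative and scikit-learn is assumed to be installed.

    import numpy as np
    import lightgbm as lgb
    from sklearn.model_selection import KFold

    X = np.random.rand(200, 5)
    y = np.random.randint(0, 2, 200)
    train_data = lgb.Dataset(X, label=y)

    # Any object with a split(X) method can be passed; each (train_idx, test_idx)
    # pair it yields becomes one CV fold, bypassing nfold/stratified/shuffle.
    splitter = KFold(n_splits=4, shuffle=True, random_state=42)
    results = lgb.cv({'objective': 'binary', 'metric': 'binary_logloss'},
                     train_data, num_boost_round=20, data_splitter=splitter)
    print(results['binary_logloss-mean'][-1])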
...
@@ -351,7 +364,10 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
         params['metric'].extend(metrics)

     results = collections.defaultdict(list)
-    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified, shuffle)
+    cvfolds = _make_n_folds(train_set, data_splitter=data_splitter,
+                            nfold=nfold, params=params, seed=seed,
+                            fpreproc=fpreproc, stratified=stratified,
+                            shuffle=shuffle)

     # setup callbacks
     if callbacks is None:
...
@@ -380,7 +396,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
                                  begin_iteration=0,
                                  end_iteration=num_boost_round,
                                  evaluation_result_list=None))
-        cvfolds.update(fobj)
+        cvfolds.update(fobj=fobj)
         res = _agg_cv_result(cvfolds.eval_valid(feval))
         for _, key, mean, _, std in res:
             results[key + '-mean'].append(mean)
...