Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
eade219e
Commit
eade219e
authored
Mar 18, 2017
by
Qiwei Ye
Browse files
merge conflict
parents
f23e6083
060bd316
Changes
129
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
843 additions
and
260 deletions
+843
-260
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+141
-30
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+10
-9
include/LightGBM/export.h
include/LightGBM/export.h
+21
-0
include/LightGBM/feature_group.h
include/LightGBM/feature_group.h
+190
-0
include/LightGBM/metric.h
include/LightGBM/metric.h
+1
-1
include/LightGBM/network.h
include/LightGBM/network.h
+1
-1
include/LightGBM/objective_function.h
include/LightGBM/objective_function.h
+1
-1
include/LightGBM/tree.h
include/LightGBM/tree.h
+27
-38
include/LightGBM/tree_learner.h
include/LightGBM/tree_learner.h
+2
-0
include/LightGBM/utils/array_args.h
include/LightGBM/utils/array_args.h
+90
-41
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+4
-1
include/LightGBM/utils/log.h
include/LightGBM/utils/log.h
+13
-1
include/LightGBM/utils/openmp_wrapper.h
include/LightGBM/utils/openmp_wrapper.h
+27
-0
include/LightGBM/utils/random.h
include/LightGBM/utils/random.h
+28
-19
include/LightGBM/utils/threading.h
include/LightGBM/utils/threading.h
+1
-1
python-package/lightgbm/__init__.py
python-package/lightgbm/__init__.py
+11
-4
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+191
-59
python-package/lightgbm/callback.py
python-package/lightgbm/callback.py
+28
-25
python-package/lightgbm/compat.py
python-package/lightgbm/compat.py
+13
-2
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+43
-27
No files found.
include/LightGBM/dataset.h
View file @
eade219e
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/config.h>
#include <LightGBM/feature.h>
#include <LightGBM/feature
_group
.h>
#include <vector>
#include <vector>
#include <utility>
#include <utility>
...
@@ -19,7 +19,6 @@ namespace LightGBM {
...
@@ -19,7 +19,6 @@ namespace LightGBM {
/*! \brief forward declaration */
/*! \brief forward declaration */
class
DatasetLoader
;
class
DatasetLoader
;
/*!
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
* \brief This class is used to store some meta(non-feature) data for training data,
* e.g. labels, weights, initial scores, qurey level informations.
* e.g. labels, weights, initial scores, qurey level informations.
...
@@ -88,8 +87,6 @@ public:
...
@@ -88,8 +87,6 @@ public:
void
SetQuery
(
const
data_size_t
*
query
,
data_size_t
len
);
void
SetQuery
(
const
data_size_t
*
query
,
data_size_t
len
);
void
SetQueryId
(
const
data_size_t
*
query_id
,
data_size_t
len
);
/*!
/*!
* \brief Set initial scores
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
* \param init_score Initial scores, this class will manage memory for init_score.
...
@@ -175,7 +172,7 @@ public:
...
@@ -175,7 +172,7 @@ public:
* \brief Get Number of queries
* \brief Get Number of queries
* \return Number of queries
* \return Number of queries
*/
*/
inline
const
data_size_t
num_queries
()
const
{
return
num_queries_
;
}
inline
data_size_t
num_queries
()
const
{
return
num_queries_
;
}
/*!
/*!
* \brief Get weights for queries, if not exists, will return nullptr
* \brief Get weights for queries, if not exists, will return nullptr
...
@@ -244,6 +241,9 @@ private:
...
@@ -244,6 +241,9 @@ private:
std
::
vector
<
data_size_t
>
queries_
;
std
::
vector
<
data_size_t
>
queries_
;
/*! \brief mutex for threading safe call */
/*! \brief mutex for threading safe call */
std
::
mutex
mutex_
;
std
::
mutex
mutex_
;
bool
weight_load_from_file_
;
bool
query_load_from_file_
;
bool
init_score_load_from_file_
;
};
};
...
@@ -280,14 +280,20 @@ class Dataset {
...
@@ -280,14 +280,20 @@ class Dataset {
public:
public:
friend
DatasetLoader
;
friend
DatasetLoader
;
Dataset
();
LIGHTGBM_EXPORT
Dataset
();
Dataset
(
data_size_t
num_data
);
LIGHTGBM_EXPORT
Dataset
(
data_size_t
num_data
);
void
Construct
(
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
const
std
::
vector
<
std
::
vector
<
int
>>&
sample_non_zero_indices
,
size_t
total_sample_cnt
,
const
IOConfig
&
io_config
);
/*! \brief Destructor */
/*! \brief Destructor */
~
Dataset
();
LIGHTGBM_EXPORT
~
Dataset
();
bool
CheckAlign
(
const
Dataset
&
other
)
const
{
LIGHTGBM_EXPORT
bool
CheckAlign
(
const
Dataset
&
other
)
const
{
if
(
num_features_
!=
other
.
num_features_
)
{
if
(
num_features_
!=
other
.
num_features_
)
{
return
false
;
return
false
;
}
}
...
@@ -298,7 +304,7 @@ public:
...
@@ -298,7 +304,7 @@ public:
return
false
;
return
false
;
}
}
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
if
(
!
f
eature
s_
[
i
]
->
CheckAlign
(
*
(
other
.
f
eature
s_
[
i
].
get
(
))))
{
if
(
!
F
eature
BinMapper
(
i
)
->
CheckAlign
(
*
(
other
.
F
eature
BinMapper
(
i
))))
{
return
false
;
return
false
;
}
}
}
}
...
@@ -306,57 +312,140 @@ public:
...
@@ -306,57 +312,140 @@ public:
}
}
inline
void
PushOneRow
(
int
tid
,
data_size_t
row_idx
,
const
std
::
vector
<
double
>&
feature_values
)
{
inline
void
PushOneRow
(
int
tid
,
data_size_t
row_idx
,
const
std
::
vector
<
double
>&
feature_values
)
{
if
(
is_finish_load_
)
{
return
;
}
for
(
size_t
i
=
0
;
i
<
feature_values
.
size
()
&&
i
<
static_cast
<
size_t
>
(
num_total_features_
);
++
i
)
{
for
(
size_t
i
=
0
;
i
<
feature_values
.
size
()
&&
i
<
static_cast
<
size_t
>
(
num_total_features_
);
++
i
)
{
int
feature_idx
=
used_feature_map_
[
i
];
int
feature_idx
=
used_feature_map_
[
i
];
if
(
feature_idx
>=
0
)
{
if
(
feature_idx
>=
0
)
{
features_
[
feature_idx
]
->
PushData
(
tid
,
row_idx
,
feature_values
[
i
]);
const
int
group
=
feature2group_
[
feature_idx
];
const
int
sub_feature
=
feature2subfeature_
[
feature_idx
];
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
row_idx
,
feature_values
[
i
]);
}
}
}
}
}
}
inline
void
PushOneRow
(
int
tid
,
data_size_t
row_idx
,
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
feature_values
)
{
inline
void
PushOneRow
(
int
tid
,
data_size_t
row_idx
,
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
feature_values
)
{
if
(
is_finish_load_
)
{
return
;
}
for
(
auto
&
inner_data
:
feature_values
)
{
for
(
auto
&
inner_data
:
feature_values
)
{
if
(
inner_data
.
first
>=
num_total_features_
)
{
continue
;
}
if
(
inner_data
.
first
>=
num_total_features_
)
{
continue
;
}
int
feature_idx
=
used_feature_map_
[
inner_data
.
first
];
int
feature_idx
=
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
if
(
feature_idx
>=
0
)
{
features_
[
feature_idx
]
->
PushData
(
tid
,
row_idx
,
inner_data
.
second
);
const
int
group
=
feature2group_
[
feature_idx
];
const
int
sub_feature
=
feature2subfeature_
[
feature_idx
];
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
row_idx
,
inner_data
.
second
);
}
}
}
}
}
}
inline
int
GetInnerFeatureIndex
(
int
col_idx
)
const
{
inline
void
PushOneData
(
int
tid
,
data_size_t
row_idx
,
int
group
,
int
sub_feature
,
double
value
)
{
feature_groups_
[
group
]
->
PushData
(
tid
,
sub_feature
,
row_idx
,
value
);
}
inline
int
RealFeatureIndex
(
int
fidx
)
const
{
return
real_feature_idx_
[
fidx
];
}
inline
int
InnerFeatureIndex
(
int
col_idx
)
const
{
return
used_feature_map_
[
col_idx
];
return
used_feature_map_
[
col_idx
];
}
}
inline
int
Feature2Group
(
int
feature_idx
)
const
{
return
feature2group_
[
feature_idx
];
}
inline
int
Feture2SubFeature
(
int
feature_idx
)
const
{
return
feature2subfeature_
[
feature_idx
];
}
inline
uint64_t
NumTotalBin
()
const
{
return
group_bin_boundaries_
.
back
();
}
Dataset
*
Subset
(
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
is_enable_sparse
)
const
;
void
ReSize
(
data_size_t
num_data
)
;
void
FinishLoad
(
);
void
CopySubset
(
const
Dataset
*
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
need_meta_data
);
bool
SetFloatField
(
const
char
*
field_name
,
const
float
*
field_data
,
data_size_t
num_element
);
LIGHTGBM_EXPORT
void
FinishLoad
(
);
bool
Set
Double
Field
(
const
char
*
field_name
,
const
double
*
field_data
,
data_size_t
num_element
);
LIGHTGBM_EXPORT
bool
Set
Float
Field
(
const
char
*
field_name
,
const
float
*
field_data
,
data_size_t
num_element
);
bool
Set
Int
Field
(
const
char
*
field_name
,
const
int
*
field_data
,
data_size_t
num_element
);
LIGHTGBM_EXPORT
bool
Set
Double
Field
(
const
char
*
field_name
,
const
double
*
field_data
,
data_size_t
num_element
);
bool
G
et
Floa
tField
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
float
**
out_ptr
);
LIGHTGBM_EXPORT
bool
S
et
In
tField
(
const
char
*
field_name
,
const
int
*
field_data
,
data_size_t
num_element
);
bool
Get
Double
Field
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
double
**
out_ptr
);
LIGHTGBM_EXPORT
bool
Get
Float
Field
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
float
**
out_ptr
);
bool
GetIntField
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
int
**
out_ptr
);
LIGHTGBM_EXPORT
bool
GetDoubleField
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
double
**
out_ptr
);
LIGHTGBM_EXPORT
bool
GetIntField
(
const
char
*
field_name
,
data_size_t
*
out_len
,
const
int
**
out_ptr
);
/*!
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
*/
void
SaveBinaryFile
(
const
char
*
bin_filename
);
LIGHTGBM_EXPORT
void
SaveBinaryFile
(
const
char
*
bin_filename
);
LIGHTGBM_EXPORT
void
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
);
LIGHTGBM_EXPORT
void
CreateValid
(
const
Dataset
*
dataset
);
void
ConstructHistograms
(
const
std
::
vector
<
int8_t
>&
is_feature_used
,
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
int
leaf_idx
,
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>&
ordered_bins
,
const
score_t
*
gradients
,
const
score_t
*
hessians
,
score_t
*
ordered_gradients
,
score_t
*
ordered_hessians
,
HistogramBinEntry
*
histogram_data
)
const
;
void
FixHistogram
(
int
feature_idx
,
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
HistogramBinEntry
*
data
)
const
;
inline
data_size_t
Split
(
int
feature
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
{
const
int
group
=
feature2group_
[
feature
];
const
int
sub_feature
=
feature2subfeature_
[
feature
];
return
feature_groups_
[
group
]
->
Split
(
sub_feature
,
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
);
}
inline
int
SubFeatureBinOffset
(
int
i
)
const
{
const
int
sub_feature
=
feature2subfeature_
[
i
];
if
(
sub_feature
==
0
)
{
return
1
;
}
else
{
return
0
;
}
}
void
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
,
bool
is_enable_sparse
);
inline
int
FeatureNumBin
(
int
i
)
const
{
const
int
group
=
feature2group_
[
i
];
const
int
sub_feature
=
feature2subfeature_
[
i
];
return
feature_groups_
[
group
]
->
bin_mappers_
[
sub_feature
]
->
num_bin
();
}
inline
const
BinMapper
*
FeatureBinMapper
(
int
i
)
const
{
const
int
group
=
feature2group_
[
i
];
const
int
sub_feature
=
feature2subfeature_
[
i
];
return
feature_groups_
[
group
]
->
bin_mappers_
[
sub_feature
].
get
();
}
/*!
inline
BinIterator
*
FeatureIterator
(
int
i
)
const
{
* \brief Get a feature pointer for specific index
const
int
group
=
feature2group_
[
i
];
* \param i Index for feature
const
int
sub_feature
=
feature2subfeature_
[
i
];
* \return Pointer of feature
return
feature_groups_
[
group
]
->
SubFeatureIterator
(
sub_feature
);
*/
}
inline
Feature
*
FeatureAt
(
int
i
)
const
{
return
features_
[
i
].
get
();
}
inline
double
RealThreshold
(
int
i
,
uint32_t
threshold
)
const
{
const
int
group
=
feature2group_
[
i
];
const
int
sub_feature
=
feature2subfeature_
[
i
];
return
feature_groups_
[
group
]
->
bin_mappers_
[
sub_feature
]
->
BinToValue
(
threshold
);
}
inline
void
CreateOrderedBins
(
std
::
vector
<
std
::
unique_ptr
<
OrderedBin
>>*
ordered_bins
)
const
{
ordered_bins
->
resize
(
num_groups_
);
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_groups_
;
++
i
)
{
ordered_bins
->
at
(
i
).
reset
(
feature_groups_
[
i
]
->
bin_data_
->
CreateOrderedBin
());
}
}
/*!
/*!
* \brief Get meta data pointer
* \brief Get meta data pointer
...
@@ -384,6 +473,20 @@ public:
...
@@ -384,6 +473,20 @@ public:
feature_names_
=
std
::
vector
<
std
::
string
>
(
feature_names
);
feature_names_
=
std
::
vector
<
std
::
string
>
(
feature_names
);
}
}
inline
std
::
vector
<
std
::
string
>
feature_infos
()
const
{
std
::
vector
<
std
::
string
>
bufs
;
for
(
int
i
=
0
;
i
<
num_total_features_
;
i
++
)
{
int
fidx
=
used_feature_map_
[
i
];
if
(
fidx
==
-
1
)
{
bufs
.
push_back
(
"none"
);
}
else
{
const
auto
bin_mapper
=
FeatureBinMapper
(
fidx
);
bufs
.
push_back
(
bin_mapper
->
bin_info
());
}
}
return
bufs
;
}
/*! \brief Get Number of data */
/*! \brief Get Number of data */
inline
data_size_t
num_data
()
const
{
return
num_data_
;
}
inline
data_size_t
num_data
()
const
{
return
num_data_
;
}
...
@@ -395,7 +498,7 @@ public:
...
@@ -395,7 +498,7 @@ public:
private:
private:
const
char
*
data_filename_
;
const
char
*
data_filename_
;
/*! \brief Store used features */
/*! \brief Store used features */
std
::
vector
<
std
::
unique_ptr
<
Feature
>>
features_
;
std
::
vector
<
std
::
unique_ptr
<
Feature
Group
>>
feature
_group
s_
;
/*! \brief Mapper from real feature index to used index*/
/*! \brief Mapper from real feature index to used index*/
std
::
vector
<
int
>
used_feature_map_
;
std
::
vector
<
int
>
used_feature_map_
;
/*! \brief Number of used features*/
/*! \brief Number of used features*/
...
@@ -412,6 +515,14 @@ private:
...
@@ -412,6 +515,14 @@ private:
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief store feature names */
/*! \brief store feature names */
static
const
char
*
binary_file_token
;
static
const
char
*
binary_file_token
;
int
num_groups_
;
std
::
vector
<
int
>
real_feature_idx_
;
std
::
vector
<
int
>
feature2group_
;
std
::
vector
<
int
>
feature2subfeature_
;
std
::
vector
<
uint64_t
>
group_bin_boundaries_
;
std
::
vector
<
int
>
group_feature_start_
;
std
::
vector
<
int
>
group_feature_cnt_
;
bool
is_finish_load_
;
};
};
}
// namespace LightGBM
}
// namespace LightGBM
...
...
include/LightGBM/dataset_loader.h
View file @
eade219e
...
@@ -8,21 +8,21 @@ namespace LightGBM {
...
@@ -8,21 +8,21 @@ namespace LightGBM {
class
DatasetLoader
{
class
DatasetLoader
{
public:
public:
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
int
num_class
,
const
char
*
filename
);
LIGHTGBM_EXPORT
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
int
num_class
,
const
char
*
filename
);
~
DatasetLoader
();
LIGHTGBM_EXPORT
~
DatasetLoader
();
Dataset
*
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
);
LIGHTGBM_EXPORT
Dataset
*
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
);
Dataset
*
LoadFromFile
(
const
char
*
filename
)
{
LIGHTGBM_EXPORT
Dataset
*
LoadFromFile
(
const
char
*
filename
)
{
return
LoadFromFile
(
filename
,
0
,
1
);
return
LoadFromFile
(
filename
,
0
,
1
);
}
}
Dataset
*
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
LIGHTGBM_EXPORT
Dataset
*
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
Dataset
*
LoadFromBinFile
(
const
char
*
data_filename
,
const
char
*
bin_filename
,
int
rank
,
int
num_machines
);
LIGHTGBM_EXPORT
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
);
size_t
total_sample_size
,
data_size_t
num_data
);
/*! \brief Disable copy */
/*! \brief Disable copy */
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
...
@@ -31,6 +31,8 @@ public:
...
@@ -31,6 +31,8 @@ public:
private:
private:
Dataset
*
LoadFromBinFile
(
const
char
*
data_filename
,
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
int
*
num_global_data
,
std
::
vector
<
data_size_t
>*
used_data_indices
);
void
SetHeader
(
const
char
*
filename
);
void
SetHeader
(
const
char
*
filename
);
void
CheckDataset
(
const
Dataset
*
dataset
);
void
CheckDataset
(
const
Dataset
*
dataset
);
...
@@ -71,7 +73,6 @@ private:
...
@@ -71,7 +73,6 @@ private:
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Mapper from real feature index to used index*/
/*! \brief Mapper from real feature index to used index*/
std
::
unordered_set
<
int
>
categorical_features_
;
std
::
unordered_set
<
int
>
categorical_features_
;
};
};
}
}
...
...
include/LightGBM/export.h
0 → 100644
View file @
eade219e
#ifndef LIGHTGBM_EXPORT_H_
#define LIGHTGBM_EXPORT_H_
/** Macros for exporting symbols in MSVC/GCC/CLANG **/
#ifdef __cplusplus
#define LIGHTGBM_EXTERN_C extern "C"
#else
#define LIGHTGBM_EXTERN_C
#endif
#ifdef _MSC_VER
#define LIGHTGBM_EXPORT __declspec(dllexport)
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
#else
#define LIGHTGBM_EXPORT
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
#endif
#endif
/** LIGHTGBM_EXPORT_H_ **/
include/LightGBM/feature.h
→
include/LightGBM/feature
_group
.h
View file @
eade219e
#ifndef LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_FEATURE_
GROUP_
H_
#define LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_
GROUP_
H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/random.h>
...
@@ -12,22 +12,41 @@
...
@@ -12,22 +12,41 @@
namespace
LightGBM
{
namespace
LightGBM
{
/*! \brief Using to store data and providing some operations on one feature*/
class
Dataset
;
class
Feature
{
class
DatasetLoader
;
/*! \brief Using to store data and providing some operations on one feature group*/
class
FeatureGroup
{
public:
public:
friend
Dataset
;
friend
DatasetLoader
;
/*!
/*!
* \brief Constructor
* \brief Constructor
* \param feature
_idx Index of this feature
* \param
num_
feature
number of features of this group
* \param bin_mapper Bin mapper for
this
feature
* \param bin_mapper
s
Bin mapper for feature
s
* \param num_data Total number of data
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
* \param is_enable_sparse True if enable sparse feature
*/
*/
Feature
(
int
feature_idx
,
BinMapper
*
bin_mapper
,
FeatureGroup
(
int
num_feature
,
data_size_t
num_data
,
bool
is_enable_sparse
)
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
:
bin_mapper_
(
bin_mapper
)
{
data_size_t
num_data
,
bool
is_enable_sparse
)
:
num_feature_
(
num_feature
)
{
feature_index_
=
feature_idx
;
CHECK
(
static_cast
<
int
>
(
bin_mappers
.
size
())
==
num_feature
);
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
bin_mapper_
->
num_bin
(),
// use bin at zero to store default_bin
bin_mapper_
->
sparse_rate
(),
is_enable_sparse
,
&
is_sparse_
,
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()));
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
int
cnt_non_zero
=
0
;
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
bin_mappers
[
i
].
release
());
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
cnt_non_zero
+=
static_cast
<
int
>
(
num_data
*
(
1.0
f
-
bin_mappers_
[
i
]
->
sparse_rate
()));
}
double
sparse_rate
=
1.0
f
-
static_cast
<
double
>
(
cnt_non_zero
)
/
(
num_data
);
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
num_total_bin_
,
sparse_rate
,
is_enable_sparse
,
&
is_sparse_
));
}
}
/*!
/*!
* \brief Constructor from memory
* \brief Constructor from memory
...
@@ -35,39 +54,44 @@ public:
...
@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
* \param local_used_indices Local used indices, empty means using all data
*/
*/
Feature
(
const
void
*
memory
,
data_size_t
num_all_data
,
Feature
Group
(
const
void
*
memory
,
data_size_t
num_all_data
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
{
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
{
const
char
*
memory_ptr
=
reinterpret_cast
<
const
char
*>
(
memory
);
const
char
*
memory_ptr
=
reinterpret_cast
<
const
char
*>
(
memory
);
// get featuer index
feature_index_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
feature_index_
);
// get is_sparse
// get is_sparse
is_sparse_
=
*
(
reinterpret_cast
<
const
bool
*>
(
memory_ptr
));
is_sparse_
=
*
(
reinterpret_cast
<
const
bool
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
is_sparse_
);
memory_ptr
+=
sizeof
(
is_sparse_
);
num_feature_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
num_feature_
);
// get bin mapper
// get bin mapper
bin_mapper_
.
reset
(
new
BinMapper
(
memory_ptr
));
bin_mappers_
.
clear
();
memory_ptr
+=
bin_mapper_
->
SizesInByte
();
bin_offsets_
.
clear
();
// start from 1, due to need to store zero bin in this slot
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
new
BinMapper
(
memory_ptr
));
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
memory_ptr
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
data_size_t
num_data
=
num_all_data
;
data_size_t
num_data
=
num_all_data
;
if
(
!
local_used_indices
.
empty
())
{
if
(
!
local_used_indices
.
empty
())
{
num_data
=
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
num_data
=
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
}
}
if
(
is_sparse_
)
{
if
(
is_sparse_
)
{
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
num_total_bin_
));
}
else
{
}
else
{
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
num_total_bin_
));
}
}
// get bin data
// get bin data
bin_data_
->
LoadFromMemory
(
memory_ptr
,
local_used_indices
);
bin_data_
->
LoadFromMemory
(
memory_ptr
,
local_used_indices
);
}
}
/*! \brief Destructor */
/*! \brief Destructor */
~
Feature
()
{
~
FeatureGroup
()
{
}
bool
CheckAlign
(
const
Feature
&
other
)
const
{
if
(
feature_index_
!=
other
.
feature_index_
)
{
return
false
;
}
return
bin_mapper_
->
CheckAlign
(
*
(
other
.
bin_mapper_
.
get
()));
}
}
/*!
/*!
...
@@ -76,66 +100,91 @@ public:
...
@@ -76,66 +100,91 @@ public:
* \param idx Index of record
* \param idx Index of record
* \param value feature value of record
* \param value feature value of record
*/
*/
inline
void
PushData
(
int
tid
,
data_size_t
line_idx
,
double
value
)
{
inline
void
PushData
(
int
tid
,
int
sub_feature_idx
,
data_size_t
line_idx
,
double
value
)
{
unsigned
int
bin
=
bin_mapper_
->
ValueToBin
(
value
);
uint32_t
bin
=
bin_mappers_
[
sub_feature_idx
]
->
ValueToBin
(
value
);
if
(
bin
==
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
())
{
return
;
}
bin
+=
bin_offsets_
[
sub_feature_idx
];
if
(
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
()
==
0
)
{
bin
-=
1
;
}
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
}
}
inline
void
PushBin
(
int
tid
,
data_size_t
line_idx
,
unsigned
int
bin
)
{
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
inline
void
CopySubset
(
const
FeatureGroup
*
full_feature
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
bin_data_
->
CopySubset
(
full_feature
->
bin_data_
.
get
(),
used_indices
,
num_used_indices
);
}
}
inline
void
FinishLoad
()
{
bin_data_
->
FinishLoad
();
}
/*! \brief Index of this feature */
inline
int
feature_index
()
const
{
return
feature_index_
;
}
/*! \brief Bin mapper that this feature used */
inline
const
BinMapper
*
bin_mapper
()
const
{
return
bin_mapper_
.
get
();
}
/*! \brief Number of bin of this feature */
inline
int
num_bin
()
const
{
return
bin_mapper_
->
num_bin
();
}
inline
BinType
bin_type
()
const
{
return
bin_mapper_
->
bin_type
();
}
inline
BinIterator
*
SubFeatureIterator
(
int
sub_feature
)
{
/*! \brief Get bin data of this feature */
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
inline
const
Bin
*
bin_data
()
const
{
return
bin_data_
.
get
();
}
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
GetIterator
(
min_bin
,
max_bin
,
default_bin
);
}
inline
data_size_t
Split
(
int
sub_feature
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
{
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
,
bin_mappers_
[
sub_feature
]
->
bin_type
());
}
/*!
/*!
* \brief From bin to feature value
* \brief From bin to feature value
* \param bin
* \param bin
* \return Feature value of this bin
* \return Feature
Group
value of this bin
*/
*/
inline
double
BinToValue
(
unsigned
int
bin
)
inline
double
BinToValue
(
int
sub_feature_idx
,
uint32_t
bin
)
const
{
const
{
return
bin_mapper_
->
BinToValue
(
bin
);
}
return
bin_mappers_
[
sub_feature_idx
]
->
BinToValue
(
bin
);
}
/*!
/*!
* \brief Save binary data to file
* \brief Save binary data to file
* \param file File want to write
* \param file File want to write
*/
*/
void
SaveBinaryToFile
(
FILE
*
file
)
const
{
void
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
feature_index_
,
sizeof
(
feature_index_
),
1
,
file
);
fwrite
(
&
is_sparse_
,
sizeof
(
is_sparse_
),
1
,
file
);
fwrite
(
&
is_sparse_
,
sizeof
(
is_sparse_
),
1
,
file
);
bin_mapper_
->
SaveBinaryToFile
(
file
);
fwrite
(
&
num_feature_
,
sizeof
(
num_feature_
),
1
,
file
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
[
i
]
->
SaveBinaryToFile
(
file
);
}
bin_data_
->
SaveBinaryToFile
(
file
);
bin_data_
->
SaveBinaryToFile
(
file
);
}
}
/*!
/*!
* \brief Get sizes in byte of this object
* \brief Get sizes in byte of this object
*/
*/
size_t
SizesInByte
()
const
{
size_t
SizesInByte
()
const
{
return
sizeof
(
feature_index_
)
+
sizeof
(
is_sparse_
)
+
size_t
ret
=
sizeof
(
is_sparse_
)
+
sizeof
(
num_feature_
);
bin_mapper_
->
SizesInByte
()
+
bin_data_
->
SizesInByte
();
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
ret
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
ret
+=
bin_data_
->
SizesInByte
();
return
ret
;
}
}
/*! \brief Disable copy */
/*! \brief Disable copy */
Feature
&
operator
=
(
const
Feature
&
)
=
delete
;
Feature
Group
&
operator
=
(
const
Feature
Group
&
)
=
delete
;
/*! \brief Disable copy */
/*! \brief Disable copy */
Feature
(
const
Feature
&
)
=
delete
;
Feature
Group
(
const
Feature
Group
&
)
=
delete
;
private:
private:
/*! \brief Index of this feature */
/*! \brief Number of features */
int
feature_index_
;
int
num_feature_
;
/*! \brief Bin mapper that this feature used */
/*! \brief Bin mapper for sub features */
std
::
unique_ptr
<
BinMapper
>
bin_mapper_
;
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers_
;
/*! \brief Bin offsets for sub features */
std
::
vector
<
uint32_t
>
bin_offsets_
;
/*! \brief Bin data of this feature */
/*! \brief Bin data of this feature */
std
::
unique_ptr
<
Bin
>
bin_data_
;
std
::
unique_ptr
<
Bin
>
bin_data_
;
/*! \brief True if this feature is sparse */
/*! \brief True if this feature is sparse */
bool
is_sparse_
;
bool
is_sparse_
;
int
num_total_bin_
;
};
};
}
// namespace LightGBM
}
// namespace LightGBM
#endif // L
ight
GBM_FEATURE_H_
#endif // L
IGHT
GBM_FEATURE_
GROUP_
H_
include/LightGBM/metric.h
View file @
eade219e
...
@@ -46,7 +46,7 @@ public:
...
@@ -46,7 +46,7 @@ public:
* \param type Specific type of metric
* \param type Specific type of metric
* \param config Config for metric
* \param config Config for metric
*/
*/
static
Metric
*
CreateMetric
(
const
std
::
string
&
type
,
const
MetricConfig
&
config
);
LIGHTGBM_EXPORT
static
Metric
*
CreateMetric
(
const
std
::
string
&
type
,
const
MetricConfig
&
config
);
};
};
...
...
include/LightGBM/network.h
View file @
eade219e
...
@@ -41,7 +41,7 @@ public:
...
@@ -41,7 +41,7 @@ public:
* When number of machines is not power of 2, need group machines into power of 2 group.
* When number of machines is not power of 2, need group machines into power of 2 group.
* And we can let each group has at most 2 machines.
* And we can let each group has at most 2 machines.
* if the group only has 1 machine. this machine is the normal node
* if the group only has 1 machine. this machine is the normal node
* if the grou has 2 machines, this group will have two type of nodes, one is the leader.
* if the grou
p
has 2 machines, this group will have two type of nodes, one is the leader.
* leader will represent this group and communication with others.
* leader will represent this group and communication with others.
*/
*/
enum
RecursiveHalvingNodeType
{
enum
RecursiveHalvingNodeType
{
...
...
include/LightGBM/objective_function.h
View file @
eade219e
...
@@ -44,7 +44,7 @@ public:
...
@@ -44,7 +44,7 @@ public:
* \param type Specific type of objective function
* \param type Specific type of objective function
* \param config Config for objective function
* \param config Config for objective function
*/
*/
static
ObjectiveFunction
*
CreateObjectiveFunction
(
const
std
::
string
&
type
,
LIGHTGBM_EXPORT
static
ObjectiveFunction
*
CreateObjectiveFunction
(
const
std
::
string
&
type
,
const
ObjectiveConfig
&
config
);
const
ObjectiveConfig
&
config
);
};
};
...
...
include/LightGBM/tree.h
View file @
eade219e
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <string>
#include <string>
...
@@ -46,7 +45,7 @@ public:
...
@@ -46,7 +45,7 @@ public:
* \param gain Split gain
* \param gain Split gain
* \return The index of new leaf.
* \return The index of new leaf.
*/
*/
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
u
nsigned
in
t
threshold
,
int
real_feature
,
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
u
int32_
t
threshold
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
...
@@ -64,8 +63,9 @@ public:
...
@@ -64,8 +63,9 @@ public:
* \param num_data Number of total data
* \param num_data Number of total data
* \param score Will add prediction to score
* \param score Will add prediction to score
*/
*/
void
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
void
AddPredictionToScore
(
const
Dataset
*
data
,
double
*
score
)
const
;
data_size_t
num_data
,
double
*
score
)
const
;
/*!
/*!
* \brief Adding prediction value of this tree model to scorese
* \brief Adding prediction value of this tree model to scorese
...
@@ -93,7 +93,7 @@ public:
...
@@ -93,7 +93,7 @@ public:
inline
int
leaf_depth
(
int
leaf_idx
)
const
{
return
leaf_depth_
[
leaf_idx
];
}
inline
int
leaf_depth
(
int
leaf_idx
)
const
{
return
leaf_depth_
[
leaf_idx
];
}
/*! \brief Get feature of specific split*/
/*! \brief Get feature of specific split*/
inline
int
split_feature
_real
(
int
split_idx
)
const
{
return
split_feature_
real_
[
split_idx
];
}
inline
int
split_feature
(
int
split_idx
)
const
{
return
split_feature_
[
split_idx
];
}
/*!
/*!
* \brief Shrinkage for the tree's output
* \brief Shrinkage for the tree's output
...
@@ -101,9 +101,11 @@ public:
...
@@ -101,9 +101,11 @@ public:
* \param rate The factor of shrinkage
* \param rate The factor of shrinkage
*/
*/
inline
void
Shrinkage
(
double
rate
)
{
inline
void
Shrinkage
(
double
rate
)
{
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
num_leaves_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_leaves_
;
++
i
)
{
leaf_value_
[
i
]
=
leaf_value_
[
i
]
*
rate
;
leaf_value_
[
i
]
*
=
rate
;
}
}
shrinkage_
*=
rate
;
}
}
/*! \brief Serialize this object to string*/
/*! \brief Serialize this object to string*/
...
@@ -138,18 +140,10 @@ public:
...
@@ -138,18 +140,10 @@ public:
}
}
}
}
static
std
::
vector
<
std
::
function
<
bool
(
unsigned
int
,
unsigned
in
t
)
>
>
inner_decision_funs
;
static
std
::
vector
<
bool
(
*
)(
uint32_t
,
uint32_
t
)
>
inner_decision_funs
;
static
std
::
vector
<
std
::
function
<
bool
(
double
,
double
)
>
>
decision_funs
;
static
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
decision_funs
;
private:
private:
/*!
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline
int
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
/*!
/*!
* \brief Find leaf index of which record belongs by features
* \brief Find leaf index of which record belongs by features
...
@@ -171,11 +165,11 @@ private:
...
@@ -171,11 +165,11 @@ private:
/*! \brief A non-leaf node's right child */
/*! \brief A non-leaf node's right child */
std
::
vector
<
int
>
right_child_
;
std
::
vector
<
int
>
right_child_
;
/*! \brief A non-leaf node's split feature */
/*! \brief A non-leaf node's split feature */
std
::
vector
<
int
>
split_feature_
;
std
::
vector
<
int
>
split_feature_
inner
;
/*! \brief A non-leaf node's split feature, the original index */
/*! \brief A non-leaf node's split feature, the original index */
std
::
vector
<
int
>
split_feature_
real_
;
std
::
vector
<
int
>
split_feature_
;
/*! \brief A non-leaf node's split threshold in bin */
/*! \brief A non-leaf node's split threshold in bin */
std
::
vector
<
u
nsigned
in
t
>
threshold_in_bin_
;
std
::
vector
<
u
int32_
t
>
threshold_in_bin_
;
/*! \brief A non-leaf node's split threshold in feature value */
/*! \brief A non-leaf node's split threshold in feature value */
std
::
vector
<
double
>
threshold_
;
std
::
vector
<
double
>
threshold_
;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
...
@@ -195,39 +189,34 @@ private:
...
@@ -195,39 +189,34 @@ private:
std
::
vector
<
data_size_t
>
internal_count_
;
std
::
vector
<
data_size_t
>
internal_count_
;
/*! \brief Depth for leaves */
/*! \brief Depth for leaves */
std
::
vector
<
int
>
leaf_depth_
;
std
::
vector
<
int
>
leaf_depth_
;
double
shrinkage_
;
bool
has_categorical_
;
};
};
inline
double
Tree
::
Predict
(
const
double
*
feature_values
)
const
{
inline
double
Tree
::
Predict
(
const
double
*
feature_values
)
const
{
int
leaf
=
GetLeaf
(
feature_values
);
if
(
num_leaves_
>
1
)
{
return
LeafOutput
(
leaf
);
int
leaf
=
GetLeaf
(
feature_values
);
return
LeafOutput
(
leaf
);
}
else
{
return
0.0
f
;
}
}
}
inline
int
Tree
::
PredictLeafIndex
(
const
double
*
feature_values
)
const
{
inline
int
Tree
::
PredictLeafIndex
(
const
double
*
feature_values
)
const
{
int
leaf
=
GetLeaf
(
feature_values
);
if
(
num_leaves_
>
1
)
{
return
leaf
;
int
leaf
=
GetLeaf
(
feature_values
);
}
return
leaf
;
}
else
{
inline
int
Tree
::
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
return
0
;
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iterators
[
split_feature_
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
}
return
~
node
;
}
}
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
int
node
=
0
;
int
node
=
0
;
while
(
node
>=
0
)
{
while
(
node
>=
0
)
{
if
(
decision_funs
[
decision_type_
[
node
]](
if
(
decision_funs
[
decision_type_
[
node
]](
feature_values
[
split_feature_
real_
[
node
]],
feature_values
[
split_feature_
[
node
]],
threshold_
[
node
]))
{
threshold_
[
node
]))
{
node
=
left_child_
[
node
];
node
=
left_child_
[
node
];
}
else
{
}
else
{
...
...
include/LightGBM/tree_learner.h
View file @
eade219e
...
@@ -27,6 +27,8 @@ public:
...
@@ -27,6 +27,8 @@ public:
*/
*/
virtual
void
Init
(
const
Dataset
*
train_data
)
=
0
;
virtual
void
Init
(
const
Dataset
*
train_data
)
=
0
;
virtual
void
ResetTrainingData
(
const
Dataset
*
train_data
)
=
0
;
/*!
/*!
* \brief Reset tree configs
* \brief Reset tree configs
* \param tree_config config of tree
* \param tree_config config of tree
...
...
include/LightGBM/utils/array_args.h
View file @
eade219e
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#include <vector>
#include <vector>
#include <algorithm>
#include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace
LightGBM
{
namespace
LightGBM
{
...
@@ -12,88 +13,136 @@ namespace LightGBM {
...
@@ -12,88 +13,136 @@ namespace LightGBM {
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
ArrayArgs
{
class
ArrayArgs
{
public:
public:
inline
static
size_t
ArgMaxMT
(
const
std
::
vector
<
VAL_T
>&
array
)
{
int
num_threads
=
1
;
#pragma omp parallel
#pragma omp master
{
num_threads
=
omp_get_num_threads
();
}
int
step
=
std
::
max
(
1
,
(
static_cast
<
int
>
(
array
.
size
())
+
num_threads
-
1
)
/
num_threads
);
std
::
vector
<
size_t
>
arg_maxs
(
num_threads
,
0
);
#pragma omp parallel for schedule(static,1)
for
(
int
i
=
0
;
i
<
num_threads
;
++
i
)
{
size_t
start
=
step
*
i
;
if
(
start
>=
array
.
size
())
{
continue
;
}
size_t
end
=
std
::
min
(
array
.
size
(),
start
+
step
);
size_t
arg_max
=
start
;
for
(
size_t
j
=
start
+
1
;
j
<
end
;
++
j
)
{
if
(
array
[
j
]
>
array
[
arg_max
])
{
arg_max
=
j
;
}
}
arg_maxs
[
i
]
=
arg_max
;
}
size_t
ret
=
arg_maxs
[
0
];
for
(
int
i
=
1
;
i
<
num_threads
;
++
i
)
{
if
(
array
[
arg_maxs
[
i
]]
>
array
[
ret
])
{
ret
=
arg_maxs
[
i
];
}
}
return
ret
;
}
inline
static
size_t
ArgMax
(
const
std
::
vector
<
VAL_T
>&
array
)
{
inline
static
size_t
ArgMax
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
if
(
array
.
empty
())
{
return
0
;
return
0
;
}
}
size_t
argMax
=
0
;
if
(
array
.
size
()
>
100
)
{
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
return
ArgMaxMT
(
array
);
if
(
array
[
i
]
>
array
[
argMax
])
{
}
else
{
argMax
=
i
;
size_t
arg_max
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
>
array
[
arg_max
])
{
arg_max
=
i
;
}
}
}
return
arg_max
;
}
}
return
argMax
;
}
}
inline
static
size_t
ArgMin
(
const
std
::
vector
<
VAL_T
>&
array
)
{
inline
static
size_t
ArgMin
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
if
(
array
.
empty
())
{
return
0
;
return
0
;
}
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
M
in
=
i
;
arg
_m
in
=
i
;
}
}
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
}
inline
static
size_t
ArgMax
(
const
VAL_T
*
array
,
size_t
n
)
{
inline
static
size_t
ArgMax
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
if
(
n
<=
0
)
{
return
0
;
return
0
;
}
}
size_t
arg
M
ax
=
0
;
size_t
arg
_m
ax
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
>
array
[
arg
M
ax
])
{
if
(
array
[
i
]
>
array
[
arg
_m
ax
])
{
arg
M
ax
=
i
;
arg
_m
ax
=
i
;
}
}
}
}
return
arg
M
ax
;
return
arg
_m
ax
;
}
}
inline
static
size_t
ArgMin
(
const
VAL_T
*
array
,
size_t
n
)
{
inline
static
size_t
ArgMin
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
if
(
n
<=
0
)
{
return
0
;
return
0
;
}
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
M
in
=
i
;
arg
_m
in
=
i
;
}
}
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
}
inline
static
size_t
Partition
(
std
::
vector
<
VAL_T
>*
array
,
size_t
start
,
size_t
end
)
{
inline
static
void
Partition
(
std
::
vector
<
VAL_T
>*
arr
,
int
start
,
int
end
,
int
*
l
,
int
*
r
)
{
VAL_T
&
pivot
=
(
*
array
)[
end
-
1
];
int
i
=
start
-
1
;
size_t
p_idx
=
start
;
int
j
=
end
-
1
;
for
(
size_t
i
=
start
;
i
<
end
-
1
;
++
i
)
{
int
p
=
i
;
if
((
*
array
)[
i
]
>
pivot
)
{
int
q
=
j
;
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
i
]);
if
(
start
>=
end
)
{
++
p_idx
;
return
;
}
}
}
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
end
-
1
]);
std
::
vector
<
VAL_T
>&
ref
=
*
arr
;
return
p_idx
;
VAL_T
v
=
ref
[
end
-
1
];
for
(;;)
{
while
(
ref
[
++
i
]
>
v
);
while
(
v
>
ref
[
--
j
])
{
if
(
j
==
start
)
{
break
;
}
}
if
(
i
>=
j
)
{
break
;
}
std
::
swap
(
ref
[
i
],
ref
[
j
]);
if
(
ref
[
i
]
==
v
)
{
p
++
;
std
::
swap
(
ref
[
p
],
ref
[
i
]);
}
if
(
v
==
ref
[
j
])
{
q
--
;
std
::
swap
(
ref
[
j
],
ref
[
q
]);
}
}
std
::
swap
(
ref
[
i
],
ref
[
end
-
1
]);
j
=
i
-
1
;
i
=
i
+
1
;
for
(
int
k
=
start
;
k
<=
p
;
k
++
,
j
--
)
{
std
::
swap
(
ref
[
k
],
ref
[
j
]);
}
for
(
int
k
=
end
-
2
;
k
>=
q
;
k
--
,
i
++
)
{
std
::
swap
(
ref
[
i
],
ref
[
k
]);
}
*
l
=
j
;
*
r
=
i
;
};
};
inline
static
size_
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
ay
,
size_
t
start
,
size_
t
end
,
size_
t
k
)
{
inline
static
in
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
,
in
t
start
,
in
t
end
,
in
t
k
)
{
if
(
start
=
=
end
-
1
)
{
if
(
start
>
=
end
-
1
)
{
return
start
;
return
start
;
}
}
size_t
p_idx
=
Partition
(
array
,
start
,
end
)
;
int
l
=
start
;
i
f
(
p_idx
==
k
)
{
i
nt
r
=
end
-
1
;
return
p_idx
;
Partition
(
arr
,
start
,
end
,
&
l
,
&
r
)
;
}
if
((
k
>
l
&&
k
<
r
)
||
l
==
0
||
r
==
end
-
1
)
{
else
if
(
k
<
p_idx
)
{
return
k
;
return
ArgMaxAtK
(
array
,
start
,
p_idx
,
k
);
}
else
if
(
k
<=
l
)
{
}
return
ArgMaxAtK
(
arr
,
start
,
l
,
k
);
else
{
}
else
{
return
ArgMaxAtK
(
arr
ay
,
p_idx
+
1
,
end
,
k
);
return
ArgMaxAtK
(
arr
,
r
,
end
,
k
);
}
}
}
}
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
size_
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
in
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
out
->
clear
();
out
->
clear
();
if
(
k
<=
0
)
{
if
(
k
<=
0
)
{
return
;
return
;
...
@@ -101,10 +150,10 @@ public:
...
@@ -101,10 +150,10 @@ public:
for
(
auto
val
:
array
)
{
for
(
auto
val
:
array
)
{
out
->
push_back
(
val
);
out
->
push_back
(
val
);
}
}
if
(
k
>=
array
.
size
())
{
if
(
static_cast
<
size_t
>
(
k
)
>=
array
.
size
())
{
return
;
return
;
}
}
ArgMaxAtK
(
out
,
0
,
out
->
size
(),
k
-
1
);
ArgMaxAtK
(
out
,
0
,
static_cast
<
int
>
(
out
->
size
()
)
,
k
-
1
);
out
->
erase
(
out
->
begin
()
+
k
,
out
->
end
());
out
->
erase
(
out
->
begin
()
+
k
,
out
->
end
());
}
}
...
...
include/LightGBM/utils/common.h
View file @
eade219e
...
@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
...
@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac
=
0
;
frac
=
0
;
scale
=
1.0
;
scale
=
1.0
;
if
((
*
p
==
'e'
)
||
(
*
p
==
'E'
))
{
if
((
*
p
==
'e'
)
||
(
*
p
==
'E'
))
{
u
nsigned
in
t
expon
;
u
int32_
t
expon
;
// Get sign of exponent, if any.
// Get sign of exponent, if any.
++
p
;
++
p
;
if
(
*
p
==
'-'
)
{
if
(
*
p
==
'-'
)
{
...
@@ -273,6 +273,9 @@ inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, cha
...
@@ -273,6 +273,9 @@ inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, cha
template
<
typename
T
>
template
<
typename
T
>
inline
static
std
::
vector
<
T
>
StringToArray
(
const
std
::
string
&
str
,
char
delimiter
,
size_t
n
)
{
inline
static
std
::
vector
<
T
>
StringToArray
(
const
std
::
string
&
str
,
char
delimiter
,
size_t
n
)
{
if
(
n
==
0
)
{
return
std
::
vector
<
T
>
();
}
std
::
vector
<
std
::
string
>
strs
=
Split
(
str
.
c_str
(),
delimiter
);
std
::
vector
<
std
::
string
>
strs
=
Split
(
str
.
c_str
(),
delimiter
);
if
(
strs
.
size
()
!=
n
)
{
if
(
strs
.
size
()
!=
n
)
{
Log
::
Fatal
(
"StringToArray error, size doesn't match."
);
Log
::
Fatal
(
"StringToArray error, size doesn't match."
);
...
...
include/LightGBM/utils/log.h
View file @
eade219e
...
@@ -45,6 +45,10 @@ public:
...
@@ -45,6 +45,10 @@ public:
GetLevel
()
=
level
;
GetLevel
()
=
level
;
}
}
static
void
ResetUseException
(
bool
use_ex
)
{
UseException
()
=
use_ex
;
}
static
void
Debug
(
const
char
*
format
,
...)
{
static
void
Debug
(
const
char
*
format
,
...)
{
va_list
val
;
va_list
val
;
va_start
(
val
,
format
);
va_start
(
val
,
format
);
...
@@ -73,7 +77,13 @@ public:
...
@@ -73,7 +77,13 @@ public:
vsprintf
(
str_buf
,
format
,
val
);
vsprintf
(
str_buf
,
format
,
val
);
#endif
#endif
va_end
(
val
);
va_end
(
val
);
throw
std
::
runtime_error
(
std
::
string
(
str_buf
));
fprintf
(
stderr
,
"[LightGBM] [Fatal] %s
\n
"
,
str_buf
);
fflush
(
stderr
);
if
(
UseException
())
{
throw
std
::
runtime_error
(
std
::
string
(
str_buf
));
}
else
{
std
::
exit
(
-
1
);
}
}
}
private:
private:
...
@@ -96,6 +106,8 @@ private:
...
@@ -96,6 +106,8 @@ private:
static
LogLevel
&
GetLevel
()
{
static
thread_local
LogLevel
level
=
LogLevel
::
Info
;
return
level
;
}
static
LogLevel
&
GetLevel
()
{
static
thread_local
LogLevel
level
=
LogLevel
::
Info
;
return
level
;
}
#endif
#endif
static
bool
&
UseException
()
{
static
bool
use_ex
=
false
;
return
use_ex
;
}
};
};
}
// namespace LightGBM
}
// namespace LightGBM
...
...
include/LightGBM/utils/openmp_wrapper.h
0 → 100644
View file @
eade219e
#ifndef LIGHTGBM_OPENMP_WRAPPER_H_
#define LIGHTGBM_OPENMP_WRAPPER_H_
#ifdef _OPENMP
#include <omp.h>
#else
#ifdef _MSC_VER
#pragma warning( disable : 4068 ) // disable unknown pragma warning
#endif
#ifdef __cplusplus
extern
"C"
{
#endif
/** Fall here if no OPENMP support, so just
simulate a single thread running.
All #pragma omp should be ignored by the compiler **/
inline
void
omp_set_num_threads
(
int
)
{}
inline
int
omp_get_num_threads
()
{
return
1
;}
inline
int
omp_get_thread_num
()
{
return
0
;}
#ifdef __cplusplus
};
// extern "C"
#endif
#endif
#endif
/* LIGHTGBM_OPENMP_WRAPPER_H_ */
include/LightGBM/utils/random.h
View file @
eade219e
...
@@ -20,30 +20,41 @@ public:
...
@@ -20,30 +20,41 @@ public:
std
::
random_device
rd
;
std
::
random_device
rd
;
auto
genrator
=
std
::
mt19937
(
rd
());
auto
genrator
=
std
::
mt19937
(
rd
());
std
::
uniform_int_distribution
<
int
>
distribution
(
0
,
x
);
std
::
uniform_int_distribution
<
int
>
distribution
(
0
,
x
);
x
=
static_cast
<
unsigned
int
>
(
distribution
(
genrator
)
)
;
x
=
distribution
(
genrator
);
}
}
/*!
/*!
* \brief Constructor, with specific seed
* \brief Constructor, with specific seed
*/
*/
Random
(
int
seed
)
{
Random
(
int
seed
)
{
x
=
static_cast
<
unsigned
int
>
(
seed
)
;
x
=
seed
;
}
}
/*!
/*!
* \brief Generate random integer
* \brief Generate random integer, int16 range. [0, 65536]
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline
int
NextShort
(
int
lower_bound
,
int
upper_bound
)
{
return
(
RandInt16
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
/*!
* \brief Generate random integer, int32 range
* \param lower_bound lower bound
* \param lower_bound lower bound
* \param upper_bound upper bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
* \return The random integer between [lower_bound, upper_bound)
*/
*/
inline
int
NextInt
(
int
lower_bound
,
int
upper_bound
)
{
inline
int
NextInt
(
int
lower_bound
,
int
upper_bound
)
{
return
(
next
())
%
(
upper_bound
-
lower_bound
+
1
)
+
lower_bound
;
return
(
RandInt32
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
}
/*!
/*!
* \brief Generate random float data
* \brief Generate random float data
* \return The random float between [0.0, 1.0)
* \return The random float between [0.0, 1.0)
*/
*/
inline
double
NextDouble
()
{
inline
float
NextFloat
()
{
// get random float in [0,1)
// get random float in [0,1)
return
static_cast
<
double
>
(
next
()
%
2047
)
/
2047
.0
f
;
return
static_cast
<
float
>
(
RandInt16
())
/
(
32768
.0
f
)
;
}
}
/*!
/*!
* \brief Sample K data from {0,1,...,N-1}
* \brief Sample K data from {0,1,...,N-1}
...
@@ -58,26 +69,24 @@ public:
...
@@ -58,26 +69,24 @@ public:
}
}
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
double
prob
=
(
K
-
ret
.
size
())
/
static_cast
<
double
>
(
N
-
i
);
double
prob
=
(
K
-
ret
.
size
())
/
static_cast
<
double
>
(
N
-
i
);
if
(
Next
Double
()
<
prob
)
{
if
(
Next
Float
()
<
prob
)
{
ret
.
push_back
(
i
);
ret
.
push_back
(
i
);
}
}
}
}
return
ret
;
return
ret
;
}
}
private:
private:
unsigned
next
()
{
inline
int
RandInt16
()
{
x
^=
x
<<
16
;
x
=
(
214013
*
x
+
2531011
);
x
^=
x
>>
5
;
return
(
x
>>
16
)
&
0x7FFF
;
x
^=
x
<<
1
;
auto
t
=
x
;
x
=
y
;
y
=
z
;
z
=
t
^
x
^
y
;
return
z
;
}
}
unsigned
int
x
=
123456789
;
unsigned
int
y
=
362436069
;
inline
int
RandInt32
()
{
unsigned
int
z
=
521288629
;
x
=
(
214013
*
x
+
2531011
);
return
x
&
0x7FFFFFF
;
}
int
x
=
123456789
;
};
};
...
...
include/LightGBM/utils/threading.h
View file @
eade219e
#ifndef LIGHTGBM_UTILS_THREADING_H_
#ifndef LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_
#include <
omp
.h>
#include <
LightGBM/utils/openmp_wrapper
.h>
#include <vector>
#include <vector>
#include <functional>
#include <functional>
...
...
python-package/lightgbm/__init__.py
View file @
eade219e
...
@@ -6,13 +6,19 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
...
@@ -6,13 +6,19 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
from
__future__
import
absolute_import
from
__future__
import
absolute_import
from
.basic
import
Dataset
,
Booster
from
.basic
import
Booster
,
Dataset
from
.engine
import
train
,
cv
from
.callback
import
(
early_stopping
,
print_evaluation
,
record_evaluation
,
from
.callback
import
print_evaluation
,
record_evaluation
,
reset_parameter
,
early_stopping
reset_parameter
)
from
.engine
import
cv
,
train
try
:
try
:
from
.sklearn
import
LGBMModel
,
LGBMRegressor
,
LGBMClassifier
,
LGBMRanker
from
.sklearn
import
LGBMModel
,
LGBMRegressor
,
LGBMClassifier
,
LGBMRanker
except
ImportError
:
except
ImportError
:
pass
pass
try
:
from
.plotting
import
plot_importance
,
plot_metric
,
plot_tree
except
ImportError
:
pass
__version__
=
0.1
__version__
=
0.1
...
@@ -20,4 +26,5 @@ __version__ = 0.1
...
@@ -20,4 +26,5 @@ __version__ = 0.1
__all__
=
[
'Dataset'
,
'Booster'
,
__all__
=
[
'Dataset'
,
'Booster'
,
'train'
,
'cv'
,
'train'
,
'cv'
,
'LGBMModel'
,
'LGBMRegressor'
,
'LGBMClassifier'
,
'LGBMRanker'
,
'LGBMModel'
,
'LGBMRegressor'
,
'LGBMClassifier'
,
'LGBMRanker'
,
'print_evaluation'
,
'record_evaluation'
,
'reset_parameter'
,
'early_stopping'
]
'print_evaluation'
,
'record_evaluation'
,
'reset_parameter'
,
'early_stopping'
,
'plot_importance'
,
'plot_metric'
,
'plot_tree'
]
python-package/lightgbm/basic.py
View file @
eade219e
...
@@ -6,13 +6,15 @@ from __future__ import absolute_import
...
@@ -6,13 +6,15 @@ from __future__ import absolute_import
import
ctypes
import
ctypes
import
os
import
os
import
warnings
from
tempfile
import
NamedTemporaryFile
from
tempfile
import
NamedTemporaryFile
import
numpy
as
np
import
numpy
as
np
import
scipy.sparse
import
scipy.sparse
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
numeric_types
,
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
range_
,
string_type
)
json_default_with_numpy
,
numeric_types
,
range_
,
string_type
)
from
.libpath
import
find_lib_path
from
.libpath
import
find_lib_path
...
@@ -213,6 +215,81 @@ def c_int_array(data):
...
@@ -213,6 +215,81 @@ def c_int_array(data):
return
(
ptr_data
,
type_data
)
return
(
ptr_data
,
type_data
)
PANDAS_DTYPE_MAPPER
=
{
'int8'
:
'int'
,
'int16'
:
'int'
,
'int32'
:
'int'
,
'int64'
:
'int'
,
'uint8'
:
'int'
,
'uint16'
:
'int'
,
'uint32'
:
'int'
,
'uint64'
:
'int'
,
'float16'
:
'float'
,
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
pandas_categorical
):
if
isinstance
(
data
,
DataFrame
):
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
msg
=
"""Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
warnings
.
filterwarnings
(
'once'
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
data
=
data
.
rename
(
columns
=
str
)
cat_cols
=
data
.
select_dtypes
(
include
=
[
'category'
]).
columns
if
pandas_categorical
is
None
:
# train dataset
pandas_categorical
=
[
list
(
data
[
col
].
cat
.
categories
)
for
col
in
cat_cols
]
else
:
if
len
(
cat_cols
)
!=
len
(
pandas_categorical
):
raise
ValueError
(
'train and valid dataset categorical_feature do not match.'
)
for
col
,
category
in
zip
(
cat_cols
,
pandas_categorical
):
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is pandas Index object
data
=
data
.
copy
()
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
)
if
categorical_feature
is
not
None
:
if
feature_name
is
None
:
feature_name
=
list
(
data
.
columns
)
if
categorical_feature
==
'auto'
:
categorical_feature
=
list
(
cat_cols
)
else
:
categorical_feature
=
list
(
categorical_feature
)
+
list
(
cat_cols
)
if
feature_name
==
'auto'
:
feature_name
=
list
(
data
.
columns
)
data_dtypes
=
data
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
data_dtypes
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data_dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
data
=
data
.
values
.
astype
(
'float'
)
else
:
if
feature_name
==
'auto'
:
feature_name
=
None
if
categorical_feature
==
'auto'
:
categorical_feature
=
None
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
def
_label_from_pandas
(
label
):
if
isinstance
(
label
,
DataFrame
):
if
len
(
label
.
columns
)
>
1
:
raise
ValueError
(
'DataFrame for label cannot have multiple columns'
)
label_dtypes
=
label
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
label_dtypes
):
raise
ValueError
(
'DataFrame.dtypes for label must be int, float or bool'
)
label
=
label
.
values
.
astype
(
'float'
)
return
label
def
_save_pandas_categorical
(
file_name
,
pandas_categorical
):
with
open
(
file_name
,
'a'
)
as
f
:
f
.
write
(
'
\n
pandas_categorical:'
+
json
.
dumps
(
pandas_categorical
,
default
=
json_default_with_numpy
))
def
_load_pandas_categorical
(
file_name
):
with
open
(
file_name
,
'r'
)
as
f
:
last_line
=
f
.
readlines
()[
-
1
]
if
last_line
.
startswith
(
'pandas_categorical:'
):
return
json
.
loads
(
last_line
[
len
(
'pandas_categorical:'
):])
return
None
class
_InnerPredictor
(
object
):
class
_InnerPredictor
(
object
):
"""
"""
A _InnerPredictor of LightGBM.
A _InnerPredictor of LightGBM.
...
@@ -244,6 +321,7 @@ class _InnerPredictor(object):
...
@@ -244,6 +321,7 @@ class _InnerPredictor(object):
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
booster_handle
is
not
None
:
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
False
self
.
__is_manage_handle
=
False
self
.
handle
=
booster_handle
self
.
handle
=
booster_handle
...
@@ -257,6 +335,7 @@ class _InnerPredictor(object):
...
@@ -257,6 +335,7 @@ class _InnerPredictor(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
None
else
:
else
:
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
...
@@ -292,6 +371,7 @@ class _InnerPredictor(object):
...
@@ -292,6 +371,7 @@ class _InnerPredictor(object):
"""
"""
if
isinstance
(
data
,
Dataset
):
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
predict_type
=
C_API_PREDICT_NORMAL
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
predict_type
=
C_API_PREDICT_RAW_SCORE
...
@@ -448,41 +528,11 @@ class _InnerPredictor(object):
...
@@ -448,41 +528,11 @@ class _InnerPredictor(object):
return
preds
,
nrow
return
preds
,
nrow
PANDAS_DTYPE_MAPPER
=
{
'int8'
:
'int'
,
'int16'
:
'int'
,
'int32'
:
'int'
,
'int64'
:
'int'
,
'uint8'
:
'int'
,
'uint16'
:
'int'
,
'uint32'
:
'int'
,
'uint64'
:
'int'
,
'float16'
:
'float'
,
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
):
if
isinstance
(
data
,
DataFrame
):
data_dtypes
=
data
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
data_dtypes
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data_dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
data
=
data
.
values
.
astype
(
'float'
)
return
data
def
_label_from_pandas
(
label
):
if
isinstance
(
label
,
DataFrame
):
if
len
(
label
.
columns
)
>
1
:
raise
ValueError
(
'DataFrame for label cannot have multiple columns'
)
label_dtypes
=
label
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
label_dtypes
):
raise
ValueError
(
'DataFrame.dtypes for label must be int, float or bool'
)
label
=
label
.
values
.
astype
(
'float'
)
return
label
class
Dataset
(
object
):
class
Dataset
(
object
):
"""Dataset in LightGBM."""
"""Dataset in LightGBM."""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
None
,
categorical_feature
=
None
,
params
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
,
free_raw_data
=
True
):
free_raw_data
=
True
):
"""
"""
Parameters
Parameters
...
@@ -502,12 +552,14 @@ class Dataset(object):
...
@@ -502,12 +552,14 @@ class Dataset(object):
Group/query size for dataset
Group/query size for dataset
silent : boolean, optional
silent : boolean, optional
Whether print messages during construction
Whether print messages during construction
feature_name : list of str
feature_name : list of str
, or 'auto'
Feature names
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
Categorical features,
type int represents index,
type int represents index,
type str represents feature names (need to specify feature_name as well)
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
params: dict, optional
Other parameters
Other parameters
free_raw_data: Bool
free_raw_data: Bool
...
@@ -527,6 +579,7 @@ class Dataset(object):
...
@@ -527,6 +579,7 @@ class Dataset(object):
self
.
free_raw_data
=
free_raw_data
self
.
free_raw_data
=
free_raw_data
self
.
used_indices
=
None
self
.
used_indices
=
None
self
.
_predictor
=
None
self
.
_predictor
=
None
self
.
pandas_categorical
=
None
def
__del__
(
self
):
def
__del__
(
self
):
self
.
_free_handle
()
self
.
_free_handle
()
...
@@ -538,12 +591,12 @@ class Dataset(object):
...
@@ -538,12 +591,12 @@ class Dataset(object):
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
feature_name
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
categorical_feature
=
None
,
params
=
None
):
categorical_feature
=
'auto'
,
params
=
None
):
if
data
is
None
:
if
data
is
None
:
self
.
handle
=
None
self
.
handle
=
None
return
return
data
=
_data_from_pandas
(
data
)
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
)
label
=
_label_from_pandas
(
label
)
label
=
_label_from_pandas
(
label
)
self
.
data_has_header
=
False
self
.
data_has_header
=
False
"""process for args"""
"""process for args"""
...
@@ -760,7 +813,8 @@ class Dataset(object):
...
@@ -760,7 +813,8 @@ class Dataset(object):
ret
=
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
ret
=
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
free_raw_data
=
self
.
free_raw_data
)
ret
.
_set_predictor
(
self
.
_predictor
)
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
return
ret
return
ret
def
subset
(
self
,
used_indices
,
params
=
None
):
def
subset
(
self
,
used_indices
,
params
=
None
):
...
@@ -777,6 +831,7 @@ class Dataset(object):
...
@@ -777,6 +831,7 @@ class Dataset(object):
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
ret
.
_predictor
=
self
.
_predictor
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
ret
.
used_indices
=
used_indices
ret
.
used_indices
=
used_indices
return
ret
return
ret
...
@@ -945,7 +1000,7 @@ class Dataset(object):
...
@@ -945,7 +1000,7 @@ class Dataset(object):
Feature names
Feature names
"""
"""
self
.
feature_name
=
feature_name
self
.
feature_name
=
feature_name
if
self
.
handle
is
not
None
and
feature_name
is
not
None
:
if
self
.
handle
is
not
None
and
feature_name
is
not
None
and
feature_name
!=
'auto'
:
if
len
(
feature_name
)
!=
self
.
num_feature
():
if
len
(
feature_name
)
!=
self
.
num_feature
():
raise
ValueError
(
"Length of feature_name({}) and num_feature({}) don't match"
.
format
(
len
(
feature_name
),
self
.
num_feature
()))
raise
ValueError
(
"Length of feature_name({}) and num_feature({}) don't match"
.
format
(
len
(
feature_name
),
self
.
num_feature
()))
c_feature_name
=
[
c_str
(
name
)
for
name
in
feature_name
]
c_feature_name
=
[
c_str
(
name
)
for
name
in
feature_name
]
...
@@ -1153,6 +1208,7 @@ class Booster(object):
...
@@ -1153,6 +1208,7 @@ class Booster(object):
self
.
__inner_predict_buffer
=
[
None
]
self
.
__inner_predict_buffer
=
[
None
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__get_eval_info
()
self
.
__get_eval_info
()
self
.
pandas_categorical
=
train_set
.
pandas_categorical
elif
model_file
is
not
None
:
elif
model_file
is
not
None
:
"""Prediction task"""
"""Prediction task"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
out_num_iterations
=
ctypes
.
c_int
(
0
)
...
@@ -1165,6 +1221,9 @@ class Booster(object):
...
@@ -1165,6 +1221,9 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
self
.
__num_class
=
out_num_class
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
'model_str'
in
params
:
self
.
__load_model_from_string
(
params
[
'model_str'
])
else
:
else
:
raise
TypeError
(
'Need at least one training dataset or model file to create booster instance'
)
raise
TypeError
(
'Need at least one training dataset or model file to create booster instance'
)
...
@@ -1176,9 +1235,10 @@ class Booster(object):
...
@@ -1176,9 +1235,10 @@ class Booster(object):
return
self
.
__deepcopy__
(
None
)
return
self
.
__deepcopy__
(
None
)
def
__deepcopy__
(
self
,
_
):
def
__deepcopy__
(
self
,
_
):
with
_temp_file
()
as
f
:
model_str
=
self
.
__save_model_to_string
()
self
.
save_model
(
f
.
name
)
booster
=
Booster
({
'model_str'
:
model_str
})
return
Booster
(
model_file
=
f
.
name
)
booster
.
pandas_categorical
=
self
.
pandas_categorical
return
booster
def
__getstate__
(
self
):
def
__getstate__
(
self
):
this
=
self
.
__dict__
.
copy
()
this
=
self
.
__dict__
.
copy
()
...
@@ -1186,22 +1246,18 @@ class Booster(object):
...
@@ -1186,22 +1246,18 @@ class Booster(object):
this
.
pop
(
'train_set'
,
None
)
this
.
pop
(
'train_set'
,
None
)
this
.
pop
(
'valid_sets'
,
None
)
this
.
pop
(
'valid_sets'
,
None
)
if
handle
is
not
None
:
if
handle
is
not
None
:
with
_temp_file
()
as
f
:
this
[
"handle"
]
=
self
.
__save_model_to_string
()
self
.
save_model
(
f
.
name
)
this
[
"handle"
]
=
f
.
readlines
()
return
this
return
this
def
__setstate__
(
self
,
state
):
def
__setstate__
(
self
,
state
):
model
=
state
[
'handle'
]
model
_str
=
state
.
get
(
'handle'
,
None
)
if
model
is
not
None
:
if
model
_str
is
not
None
:
handle
=
ctypes
.
c_void_p
()
handle
=
ctypes
.
c_void_p
()
out_num_iterations
=
ctypes
.
c_int
(
0
)
out_num_iterations
=
ctypes
.
c_int
(
0
)
with
_temp_file
()
as
f
:
_safe_call
(
_LIB
.
LGBM_BoosterLoadModelFromString
(
f
.
writelines
(
model
)
c_str
(
model_str
),
_safe_call
(
_LIB
.
LGBM_BoosterCreateFromModelfile
(
ctypes
.
byref
(
out_num_iterations
),
c_str
(
f
.
name
),
ctypes
.
byref
(
handle
)))
ctypes
.
byref
(
out_num_iterations
),
ctypes
.
byref
(
handle
)))
state
[
'handle'
]
=
handle
state
[
'handle'
]
=
handle
self
.
__dict__
.
update
(
state
)
self
.
__dict__
.
update
(
state
)
...
@@ -1421,6 +1477,47 @@ class Booster(object):
...
@@ -1421,6 +1477,47 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
ctypes
.
c_int
(
num_iteration
),
c_str
(
filename
)))
c_str
(
filename
)))
_save_pandas_categorical
(
filename
,
self
.
pandas_categorical
)
def
__load_model_from_string
(
self
,
model_str
):
"""[Private] Load model from string"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterLoadModelFromString
(
c_str
(
model_str
),
ctypes
.
byref
(
out_num_iterations
),
ctypes
.
byref
(
self
.
handle
)))
out_num_class
=
ctypes
.
c_int
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
def
__save_model_to_string
(
self
,
num_iteration
=-
1
):
"""[Private] Save model to string"""
if
num_iteration
<=
0
:
num_iteration
=
self
.
best_iteration
buffer_len
=
1
<<
20
tmp_out_len
=
ctypes
.
c_int
(
0
)
string_buffer
=
ctypes
.
create_string_buffer
(
buffer_len
)
ptr_string_buffer
=
ctypes
.
c_char_p
(
*
[
ctypes
.
addressof
(
string_buffer
)])
_safe_call
(
_LIB
.
LGBM_BoosterSaveModelToString
(
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
ctypes
.
c_int
(
buffer_len
),
ctypes
.
byref
(
tmp_out_len
),
ptr_string_buffer
))
actual_len
=
tmp_out_len
.
value
'''if buffer length is not long enough, re-allocate a buffer'''
if
actual_len
>
buffer_len
:
string_buffer
=
ctypes
.
create_string_buffer
(
actual_len
)
ptr_string_buffer
=
ctypes
.
c_char_p
(
*
[
ctypes
.
addressof
(
string_buffer
)])
_safe_call
(
_LIB
.
LGBM_BoosterSaveModelToString
(
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
ctypes
.
c_int
(
actual_len
),
ctypes
.
byref
(
tmp_out_len
),
ptr_string_buffer
))
return
string_buffer
.
value
.
decode
()
def
dump_model
(
self
,
num_iteration
=-
1
):
def
dump_model
(
self
,
num_iteration
=-
1
):
"""
"""
...
@@ -1484,24 +1581,59 @@ class Booster(object):
...
@@ -1484,24 +1581,59 @@ class Booster(object):
-------
-------
Prediction result
Prediction result
"""
"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
=
self
.
_to_predictor
(
)
if
num_iteration
<=
0
:
if
num_iteration
<=
0
:
num_iteration
=
self
.
best_iteration
num_iteration
=
self
.
best_iteration
return
predictor
.
predict
(
data
,
num_iteration
,
raw_score
,
pred_leaf
,
data_has_header
,
is_reshape
)
return
predictor
.
predict
(
data
,
num_iteration
,
raw_score
,
pred_leaf
,
data_has_header
,
is_reshape
)
def
_to_predictor
(
self
):
def
_to_predictor
(
self
):
"""Convert to predictor
"""Convert to predictor"""
"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
.
pandas_categorical
=
self
.
pandas_categorical
return
predictor
return
predictor
def
feature_name
(
self
):
"""
Get feature names.
Returns
-------
result : array
Array of feature names.
"""
out_num_feature
=
ctypes
.
c_int
(
0
)
"""Get num of features"""
_safe_call
(
_LIB
.
LGBM_BoosterGetNumFeature
(
self
.
handle
,
ctypes
.
byref
(
out_num_feature
)))
num_feature
=
out_num_feature
.
value
"""Get name of features"""
tmp_out_len
=
ctypes
.
c_int
(
0
)
string_buffers
=
[
ctypes
.
create_string_buffer
(
255
)
for
i
in
range_
(
num_feature
)]
ptr_string_buffers
=
(
ctypes
.
c_char_p
*
num_feature
)(
*
map
(
ctypes
.
addressof
,
string_buffers
))
_safe_call
(
_LIB
.
LGBM_BoosterGetFeatureNames
(
self
.
handle
,
ctypes
.
byref
(
tmp_out_len
),
ptr_string_buffers
))
if
num_feature
!=
tmp_out_len
.
value
:
raise
ValueError
(
"Length of feature names doesn't equal with num_feature"
)
return
[
string_buffers
[
i
].
value
.
decode
()
for
i
in
range_
(
num_feature
)]
def
feature_importance
(
self
,
importance_type
=
'split'
):
def
feature_importance
(
self
,
importance_type
=
'split'
):
"""
"""
Feature importances
Get feature importances
Parameters
----------
importance_type : str, default "split"
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
Returns
Returns
-------
-------
Array of feature importances
result : array
Array of feature importances.
"""
"""
if
importance_type
not
in
[
"split"
,
"gain"
]:
if
importance_type
not
in
[
"split"
,
"gain"
]:
raise
KeyError
(
"importance_type must be split or gain"
)
raise
KeyError
(
"importance_type must be split or gain"
)
...
...
python-package/lightgbm/callback.py
View file @
eade219e
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
from
__future__
import
absolute_import
from
__future__
import
absolute_import
import
collections
import
collections
from
operator
import
gt
,
lt
from
.compat
import
range_
from
.compat
import
range_
...
@@ -159,48 +160,50 @@ def early_stopping(stopping_rounds, verbose=True):
...
@@ -159,48 +160,50 @@ def early_stopping(stopping_rounds, verbose=True):
callback : function
callback : function
The requested callback function.
The requested callback function.
"""
"""
factor_to_bigger_better
=
{}
best_score
=
[]
best_
score
=
{}
best_
iter
=
[]
best_
iter
=
{}
best_
msg
=
[]
best_msg
=
{}
cmp_op
=
[]
def
init
(
env
):
def
init
(
env
):
"""internal function"""
"""internal function"""
if
not
env
.
evaluation_result_list
:
if
not
env
.
evaluation_result_list
:
raise
ValueError
(
'For early stopping, at least one dataset
or
eval metric is required for evaluation'
)
raise
ValueError
(
'For early stopping, at least one dataset
and
eval metric is required for evaluation'
)
if
verbose
:
if
verbose
:
msg
=
"Train until valid scores didn't improve in {} rounds."
msg
=
"Train until valid scores didn't improve in {} rounds."
print
(
msg
.
format
(
stopping_rounds
))
print
(
msg
.
format
(
stopping_rounds
))
for
i
in
range_
(
len
(
env
.
evaluation_result_list
)):
for
eval_ret
in
env
.
evaluation_result_list
:
best_score
[
i
]
=
float
(
'-inf'
)
best_iter
.
append
(
0
)
best_iter
[
i
]
=
0
if
verbose
:
if
verbose
:
best_msg
[
i
]
=
""
best_msg
.
append
(
None
)
factor_to_bigger_better
[
i
]
=
1.0
if
env
.
evaluation_result_list
[
i
][
3
]
else
-
1.0
if
eval_ret
[
3
]:
best_score
.
append
(
float
(
'-inf'
))
cmp_op
.
append
(
gt
)
else
:
best_score
.
append
(
float
(
'inf'
))
cmp_op
.
append
(
lt
)
def
callback
(
env
):
def
callback
(
env
):
"""internal function"""
"""internal function"""
if
not
best_score
:
if
not
cmp_op
:
init
(
env
)
init
(
env
)
best_msg_buffer
=
None
for
i
in
range_
(
len
(
env
.
evaluation_result_list
)):
for
i
in
range_
(
len
(
env
.
evaluation_result_list
)):
score
=
env
.
evaluation_result_list
[
i
][
2
]
*
factor_to_bigger_better
[
i
]
score
=
env
.
evaluation_result_list
[
i
][
2
]
if
score
>
best_score
[
i
]:
if
cmp_op
[
i
](
score
,
best_score
[
i
]
)
:
best_score
[
i
]
=
score
best_score
[
i
]
=
score
best_iter
[
i
]
=
env
.
iteration
best_iter
[
i
]
=
env
.
iteration
if
verbose
:
if
verbose
:
best_msg
[
i
]
=
'[%d]
\t
%s'
%
(
if
not
best_msg_buffer
:
env
.
iteration
+
1
,
'
\t
'
.
join
(
best_msg_buffer
=
'[%d]
\t
%s'
%
(
[
_format_eval_result
(
x
)
for
x
in
env
.
evaluation_result_list
]
env
.
iteration
+
1
,
'
\t
'
.
join
([
_format_eval_result
(
x
)
for
x
in
env
.
evaluation_result_list
]))
)
best_msg
[
i
]
=
best_msg_buffer
)
elif
env
.
iteration
-
best_iter
[
i
]
>=
stopping_rounds
:
else
:
env
.
model
.
set_attr
(
best_iteration
=
str
(
best_iter
[
i
]))
if
env
.
iteration
-
best_iter
[
i
]
>=
stopping_rounds
:
if
verbose
:
env
.
model
.
set_attr
(
best_iteration
=
str
(
best_iter
[
i
]))
print
(
'Early stopping, best iteration is:
\n
'
+
best_msg
[
i
])
if
verbose
:
raise
EarlyStopException
(
best_iter
[
i
])
print
(
'Early stopping, best iteration is:'
)
print
(
best_msg
[
i
])
raise
EarlyStopException
(
best_iter
[
i
])
callback
.
order
=
30
callback
.
order
=
30
return
callback
return
callback
python-package/lightgbm/compat.py
View file @
eade219e
...
@@ -6,13 +6,15 @@ from __future__ import absolute_import
...
@@ -6,13 +6,15 @@ from __future__ import absolute_import
import
inspect
import
inspect
import
sys
import
sys
import
numpy
as
np
is_py3
=
(
sys
.
version_info
[
0
]
==
3
)
is_py3
=
(
sys
.
version_info
[
0
]
==
3
)
"""compatibility between python2 and python3"""
"""compatibility between python2 and python3"""
if
is_py3
:
if
is_py3
:
string_type
=
str
string_type
=
str
numeric_types
=
(
int
,
float
,
bool
)
numeric_types
=
(
int
,
float
,
bool
)
integer_types
=
int
integer_types
=
(
int
,
)
range_
=
range
range_
=
range
def
argc_
(
func
):
def
argc_
(
func
):
...
@@ -36,6 +38,16 @@ except (ImportError, SyntaxError):
...
@@ -36,6 +38,16 @@ except (ImportError, SyntaxError):
# because of u'...' Unicode literals.
# because of u'...' Unicode literals.
import
json
import
json
def
json_default_with_numpy
(
obj
):
if
isinstance
(
obj
,
(
np
.
integer
,
np
.
floating
,
np
.
bool_
)):
return
obj
.
item
()
elif
isinstance
(
obj
,
np
.
ndarray
):
return
obj
.
tolist
()
else
:
return
obj
"""pandas"""
"""pandas"""
try
:
try
:
from
pandas
import
Series
,
DataFrame
from
pandas
import
Series
,
DataFrame
...
@@ -69,5 +81,4 @@ except ImportError:
...
@@ -69,5 +81,4 @@ except ImportError:
LGBMClassifierBase
=
object
LGBMClassifierBase
=
object
LGBMRegressorBase
=
object
LGBMRegressorBase
=
object
LGBMLabelEncoder
=
None
LGBMLabelEncoder
=
None
LGBMDeprecated
=
None
LGBMStratifiedKFold
=
None
LGBMStratifiedKFold
=
None
python-package/lightgbm/engine.py
View file @
eade219e
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""
"""
...
@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
...
@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
model used for continued train
feature_name : list of str
feature_name : list of str
, or 'auto'
Feature names
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
Categorical features,
type int represents index,
type int represents index,
type str represents feature names (need to specify feature_name as well)
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
Requires at least one validation data and one metric
...
@@ -96,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
...
@@ -96,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
init_iteration
=
predictor
.
num_total_iteration
if
predictor
is
not
None
else
0
init_iteration
=
predictor
.
num_total_iteration
if
predictor
is
not
None
else
0
"""check dataset"""
"""check dataset"""
if
not
isinstance
(
train_set
,
Dataset
):
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
"Tranin
i
g only accepts Dataset object"
)
raise
TypeError
(
"Tra
i
ning only accepts Dataset object"
)
train_set
.
_update_params
(
params
)
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
_set_predictor
(
predictor
)
...
@@ -219,28 +221,35 @@ class CVBooster(object):
...
@@ -219,28 +221,35 @@ class CVBooster(object):
return
handlerFunction
return
handlerFunction
def
_make_n_folds
(
full_data
,
nfold
,
params
,
seed
,
fpreproc
=
None
,
stratified
=
False
,
shuffle
=
True
):
def
_make_n_folds
(
full_data
,
data_splitter
,
nfold
,
params
,
seed
,
fpreproc
=
None
,
stratified
=
False
,
shuffle
=
True
):
"""
"""
Make an n-fold list of Booster from random indices.
Make an n-fold list of Booster from random indices.
"""
"""
np
.
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
if
stratified
:
num_data
=
full_data
.
construct
().
num_data
()
if
SKLEARN_INSTALLED
:
if
data_splitter
is
not
None
:
sfk
=
LGBMStratifiedKFold
(
n_splits
=
nfold
,
shuffle
=
shuffle
,
random_state
=
seed
)
if
not
hasattr
(
data_splitter
,
'split'
):
idset
=
[
x
[
1
]
for
x
in
sfk
.
split
(
X
=
full_data
.
get_label
(),
y
=
full_data
.
get_label
())]
raise
AttributeError
(
"data_splitter has no method 'split'"
)
else
:
folds
=
data_splitter
.
split
(
np
.
arange
(
num_data
))
elif
stratified
:
if
not
SKLEARN_INSTALLED
:
raise
LightGBMError
(
'Scikit-learn is required for stratified cv'
)
raise
LightGBMError
(
'Scikit-learn is required for stratified cv'
)
sfk
=
LGBMStratifiedKFold
(
n_splits
=
nfold
,
shuffle
=
shuffle
,
random_state
=
seed
)
folds
=
sfk
.
split
(
X
=
np
.
zeros
(
num_data
),
y
=
full_data
.
get_label
())
else
:
else
:
full_data
.
construct
()
if
shuffle
:
if
shuffle
:
randidx
=
np
.
random
.
permutation
(
full_data
.
num_data
())
randidx
=
np
.
random
.
permutation
(
num_data
)
kstep
=
int
(
len
(
randidx
)
/
nfold
)
else
:
idset
=
[
randidx
[(
i
*
kstep
):
min
(
len
(
randidx
),
(
i
+
1
)
*
kstep
)]
for
i
in
range_
(
nfold
)]
randidx
=
np
.
arange
(
num_data
)
kstep
=
int
(
num_data
/
nfold
)
test_id
=
[
randidx
[
i
:
i
+
kstep
]
for
i
in
range_
(
0
,
num_data
,
kstep
)]
train_id
=
[
np
.
concatenate
([
test_id
[
i
]
for
i
in
range_
(
nfold
)
if
k
!=
i
])
for
k
in
range_
(
nfold
)]
folds
=
zip
(
train_id
,
test_id
)
ret
=
CVBooster
()
ret
=
CVBooster
()
for
k
in
range_
(
n
fold
)
:
for
train_idx
,
test_idx
in
fold
s
:
train_set
=
full_data
.
subset
(
np
.
concatenate
([
idset
[
i
]
for
i
in
range_
(
nfold
)
if
k
!=
i
])
)
train_set
=
full_data
.
subset
(
train_idx
)
valid_set
=
full_data
.
subset
(
idset
[
k
]
)
valid_set
=
full_data
.
subset
(
test_idx
)
# run preprocessing on the data set if needed
# run preprocessing on the data set if needed
if
fpreproc
is
not
None
:
if
fpreproc
is
not
None
:
train_set
,
valid_set
,
tparam
=
fpreproc
(
train_set
,
valid_set
,
params
.
copy
())
train_set
,
valid_set
,
tparam
=
fpreproc
(
train_set
,
valid_set
,
params
.
copy
())
...
@@ -265,9 +274,10 @@ def _agg_cv_result(raw_results):
...
@@ -265,9 +274,10 @@ def _agg_cv_result(raw_results):
return
[(
'cv_agg'
,
k
,
np
.
mean
(
v
),
metric_type
[
k
],
np
.
std
(
v
))
for
k
,
v
in
cvmap
.
items
()]
return
[(
'cv_agg'
,
k
,
np
.
mean
(
v
),
metric_type
[
k
],
np
.
std
(
v
))
for
k
,
v
in
cvmap
.
items
()]
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
nfold
=
5
,
stratified
=
False
,
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
shuffle
=
True
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
callbacks
=
None
):
callbacks
=
None
):
...
@@ -282,14 +292,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
...
@@ -282,14 +292,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Data to be trained.
Data to be trained.
num_boost_round : int
num_boost_round : int
Number of boosting iterations.
Number of boosting iterations.
data_splitter : an instance with split(X) method
Instance with split(X) method.
nfold : int
nfold : int
Number of folds in CV.
Number of folds in CV.
stratified : bool
stratified : bool
Perform stratified sampling.
Perform stratified sampling.
shuffle: bool
shuffle: bool
Whether shuffle before split data
Whether shuffle before split data
folds : a KFold or StratifiedKFold instance
Sklearn KFolds or StratifiedKFolds.
metrics : string or list of strings
metrics : string or list of strings
Evaluation metrics to be watched in CV.
Evaluation metrics to be watched in CV.
fobj : function
fobj : function
...
@@ -298,11 +308,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
...
@@ -298,11 +308,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Custom evaluation function.
Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
model used for continued train
feature_name : list of str
feature_name : list of str
, or 'auto'
Feature names
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data columns name
Categorical features, type int represents index,
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
every <early_stopping_rounds> round(s) to continue.
...
@@ -351,7 +364,10 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
...
@@ -351,7 +364,10 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
params
[
'metric'
].
extend
(
metrics
)
params
[
'metric'
].
extend
(
metrics
)
results
=
collections
.
defaultdict
(
list
)
results
=
collections
.
defaultdict
(
list
)
cvfolds
=
_make_n_folds
(
train_set
,
nfold
,
params
,
seed
,
fpreproc
,
stratified
,
shuffle
)
cvfolds
=
_make_n_folds
(
train_set
,
data_splitter
=
data_splitter
,
nfold
=
nfold
,
params
=
params
,
seed
=
seed
,
fpreproc
=
fpreproc
,
stratified
=
stratified
,
shuffle
=
shuffle
)
# setup callbacks
# setup callbacks
if
callbacks
is
None
:
if
callbacks
is
None
:
...
@@ -380,7 +396,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
...
@@ -380,7 +396,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
begin_iteration
=
0
,
begin_iteration
=
0
,
end_iteration
=
num_boost_round
,
end_iteration
=
num_boost_round
,
evaluation_result_list
=
None
))
evaluation_result_list
=
None
))
cvfolds
.
update
(
fobj
)
cvfolds
.
update
(
fobj
=
fobj
)
res
=
_agg_cv_result
(
cvfolds
.
eval_valid
(
feval
))
res
=
_agg_cv_result
(
cvfolds
.
eval_valid
(
feval
))
for
_
,
key
,
mean
,
_
,
std
in
res
:
for
_
,
key
,
mean
,
_
,
std
in
res
:
results
[
key
+
'-mean'
].
append
(
mean
)
results
[
key
+
'-mean'
].
append
(
mean
)
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment