Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
4f77bd28
Commit
4f77bd28
authored
Feb 20, 2017
by
Guolin Ke
Browse files
update to v2.
parent
13d4581b
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
726 additions
and
506 deletions
+726
-506
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+3
-3
include/LightGBM/feature_group.h
include/LightGBM/feature_group.h
+190
-0
include/LightGBM/tree.h
include/LightGBM/tree.h
+37
-46
include/LightGBM/utils/array_args.h
include/LightGBM/utils/array_args.h
+89
-40
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+1
-1
include/LightGBM/utils/random.h
include/LightGBM/utils/random.h
+23
-6
pmml/pmml.py
pmml/pmml.py
+2
-3
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+8
-94
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+2
-14
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+1
-6
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+4
-13
src/boosting/boosting.cpp
src/boosting/boosting.cpp
+7
-0
src/boosting/dart.hpp
src/boosting/dart.hpp
+5
-0
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+3
-59
src/boosting/gbdt.h
src/boosting/gbdt.h
+0
-2
src/boosting/goss.hpp
src/boosting/goss.hpp
+174
-0
src/boosting/score_updater.hpp
src/boosting/score_updater.hpp
+2
-0
src/c_api.cpp
src/c_api.cpp
+23
-18
src/io/bin.cpp
src/io/bin.cpp
+138
-195
src/io/config.cpp
src/io/config.cpp
+14
-6
No files found.
include/LightGBM/dataset_loader.h
View file @
4f77bd28
...
...
@@ -20,7 +20,9 @@ public:
LIGHTGBM_EXPORT
Dataset
*
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
LIGHTGBM_EXPORT
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
);
LIGHTGBM_EXPORT
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
size_t
total_sample_size
,
data_size_t
num_data
);
/*! \brief Disable copy */
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
...
...
@@ -69,8 +71,6 @@ private:
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Mapper from real feature index to used index*/
std
::
unordered_set
<
int
>
categorical_features_
;
};
...
...
include/LightGBM/feature.h
→
include/LightGBM/feature
_group
.h
View file @
4f77bd28
#ifndef LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_FEATURE_
GROUP_
H_
#define LIGHTGBM_FEATURE_
GROUP_
H_
#include <LightGBM/utils/random.h>
...
...
@@ -12,22 +12,41 @@
namespace
LightGBM
{
/*! \brief Using to store data and providing some operations on one feature*/
class
Feature
{
class
Dataset
;
class
DatasetLoader
;
/*! \brief Using to store data and providing some operations on one feature group*/
class
FeatureGroup
{
public:
friend
Dataset
;
friend
DatasetLoader
;
/*!
* \brief Constructor
* \param feature
_idx Index of this feature
* \param bin_mapper Bin mapper for
this
feature
* \param
num_
feature
number of features of this group
* \param bin_mapper
s
Bin mapper for feature
s
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
*/
Feature
(
int
feature_idx
,
BinMapper
*
bin_mapper
,
data_size_t
num_data
,
bool
is_enable_sparse
)
:
bin_mapper_
(
bin_mapper
)
{
feature_index_
=
feature_idx
;
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
sparse_rate
(),
is_enable_sparse
,
&
is_sparse_
,
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()));
FeatureGroup
(
int
num_feature
,
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
data_size_t
num_data
,
bool
is_enable_sparse
)
:
num_feature_
(
num_feature
)
{
CHECK
(
static_cast
<
int
>
(
bin_mappers
.
size
())
==
num_feature
);
// use bin at zero to store default_bin
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
int
cnt_non_zero
=
0
;
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
bin_mappers
[
i
].
release
());
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
cnt_non_zero
+=
static_cast
<
int
>
(
num_data
*
(
1.0
f
-
bin_mappers_
[
i
]
->
sparse_rate
()));
}
double
sparse_rate
=
1.0
f
-
static_cast
<
double
>
(
cnt_non_zero
)
/
(
num_data
);
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
num_total_bin_
,
sparse_rate
,
is_enable_sparse
,
&
is_sparse_
));
}
/*!
* \brief Constructor from memory
...
...
@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
Feature
(
const
void
*
memory
,
data_size_t
num_all_data
,
Feature
Group
(
const
void
*
memory
,
data_size_t
num_all_data
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
{
const
char
*
memory_ptr
=
reinterpret_cast
<
const
char
*>
(
memory
);
// get featuer index
feature_index_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
feature_index_
);
// get is_sparse
is_sparse_
=
*
(
reinterpret_cast
<
const
bool
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
is_sparse_
);
num_feature_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
num_feature_
);
// get bin mapper
bin_mapper_
.
reset
(
new
BinMapper
(
memory_ptr
));
memory_ptr
+=
bin_mapper_
->
SizesInByte
();
bin_mappers_
.
clear
();
bin_offsets_
.
clear
();
// start from 1, due to need to store zero bin in this slot
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
new
BinMapper
(
memory_ptr
));
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
memory_ptr
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
data_size_t
num_data
=
num_all_data
;
if
(
!
local_used_indices
.
empty
())
{
num_data
=
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
}
if
(
is_sparse_
)
{
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
num_total_bin_
));
}
else
{
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
num_total_bin_
));
}
// get bin data
bin_data_
->
LoadFromMemory
(
memory_ptr
,
local_used_indices
);
}
/*! \brief Destructor */
~
Feature
()
{
}
bool
CheckAlign
(
const
Feature
&
other
)
const
{
if
(
feature_index_
!=
other
.
feature_index_
)
{
return
false
;
}
return
bin_mapper_
->
CheckAlign
(
*
(
other
.
bin_mapper_
.
get
()));
~
FeatureGroup
()
{
}
/*!
...
...
@@ -76,78 +100,91 @@ public:
* \param idx Index of record
* \param value feature value of record
*/
inline
void
PushData
(
int
tid
,
data_size_t
line_idx
,
double
value
)
{
unsigned
int
bin
=
bin_mapper_
->
ValueToBin
(
value
);
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
}
inline
void
PushBin
(
int
tid
,
data_size_t
line_idx
,
unsigned
int
bin
)
{
inline
void
PushData
(
int
tid
,
int
sub_feature_idx
,
data_size_t
line_idx
,
double
value
)
{
uint32_t
bin
=
bin_mappers_
[
sub_feature_idx
]
->
ValueToBin
(
value
);
if
(
bin
==
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
())
{
return
;
}
bin
+=
bin_offsets_
[
sub_feature_idx
];
if
(
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
()
==
0
)
{
bin
-=
1
;
}
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
}
inline
void
CopySubset
(
const
Feature
*
full_feature
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
inline
void
CopySubset
(
const
Feature
Group
*
full_feature
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
bin_data_
->
CopySubset
(
full_feature
->
bin_data_
.
get
(),
used_indices
,
num_used_indices
);
}
inline
void
ReSize
(
data_size_t
num_data
)
{
bin_data_
->
ReSize
(
num_data
);
inline
BinIterator
*
SubFetureIterator
(
int
sub_feature
)
{
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
GetIterator
(
min_bin
,
max_bin
,
default_bin
);
}
inline
bool
is_sparse
()
const
{
return
is_sparse_
;
}
inline
void
FinishLoad
()
{
bin_data_
->
FinishLoad
();
}
/*! \brief Index of this feature */
inline
int
feature_index
()
const
{
return
feature_index_
;
}
/*! \brief Bin mapper that this feature used */
inline
const
BinMapper
*
bin_mapper
()
const
{
return
bin_mapper_
.
get
();
}
/*! \brief Number of bin of this feature */
inline
int
num_bin
()
const
{
return
bin_mapper_
->
num_bin
();
}
inline
BinType
bin_type
()
const
{
return
bin_mapper_
->
bin_type
();
}
/*! \brief Get bin data of this feature */
inline
const
Bin
*
bin_data
()
const
{
return
bin_data_
.
get
();
}
inline
data_size_t
Split
(
int
sub_feature
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
{
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
);
}
/*!
* \brief From bin to feature value
* \param bin
* \return Feature value of this bin
* \return Feature
Group
value of this bin
*/
inline
double
BinToValue
(
unsigned
int
bin
)
const
{
return
bin_mapper_
->
BinToValue
(
bin
);
}
inline
double
BinToValue
(
int
sub_feature_idx
,
uint32_t
bin
)
const
{
return
bin_mappers_
[
sub_feature_idx
]
->
BinToValue
(
bin
);
}
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
feature_index_
,
sizeof
(
feature_index_
),
1
,
file
);
fwrite
(
&
is_sparse_
,
sizeof
(
is_sparse_
),
1
,
file
);
bin_mapper_
->
SaveBinaryToFile
(
file
);
fwrite
(
&
num_feature_
,
sizeof
(
num_feature_
),
1
,
file
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
[
i
]
->
SaveBinaryToFile
(
file
);
}
bin_data_
->
SaveBinaryToFile
(
file
);
}
/*!
* \brief Get sizes in byte of this object
*/
size_t
SizesInByte
()
const
{
return
sizeof
(
feature_index_
)
+
sizeof
(
is_sparse_
)
+
bin_mapper_
->
SizesInByte
()
+
bin_data_
->
SizesInByte
();
size_t
ret
=
sizeof
(
is_sparse_
)
+
sizeof
(
num_feature_
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
ret
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
ret
+=
bin_data_
->
SizesInByte
();
return
ret
;
}
/*! \brief Disable copy */
Feature
&
operator
=
(
const
Feature
&
)
=
delete
;
Feature
Group
&
operator
=
(
const
Feature
Group
&
)
=
delete
;
/*! \brief Disable copy */
Feature
(
const
Feature
&
)
=
delete
;
Feature
Group
(
const
Feature
Group
&
)
=
delete
;
private:
/*! \brief Index of this feature */
int
feature_index_
;
/*! \brief Bin mapper that this feature used */
std
::
unique_ptr
<
BinMapper
>
bin_mapper_
;
/*! \brief Number of features */
int
num_feature_
;
/*! \brief Bin mapper for sub features */
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers_
;
/*! \brief Bin offsets for sub features */
std
::
vector
<
uint32_t
>
bin_offsets_
;
/*! \brief Bin data of this feature */
std
::
unique_ptr
<
Bin
>
bin_data_
;
/*! \brief True if this feature is sparse */
bool
is_sparse_
;
int
num_total_bin_
;
};
}
// namespace LightGBM
#endif // L
ight
GBM_FEATURE_H_
#endif // L
IGHT
GBM_FEATURE_
GROUP_
H_
include/LightGBM/tree.h
View file @
4f77bd28
...
...
@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <string>
...
...
@@ -35,7 +34,6 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
...
...
@@ -46,7 +44,7 @@ public:
* \param gain Split gain
* \return The index of new leaf.
*/
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
unsigned
in
t
threshold
,
int
real_feature
,
int
Split
(
int
leaf
,
int
feature
,
uint32_
t
threshold
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
...
...
@@ -64,8 +62,9 @@ public:
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
;
void
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
;
/*!
* \brief Adding prediction value of this tree model to scorese
...
...
@@ -93,7 +92,7 @@ public:
inline
int
leaf_depth
(
int
leaf_idx
)
const
{
return
leaf_depth_
[
leaf_idx
];
}
/*! \brief Get feature of specific split*/
inline
int
split_feature
_real
(
int
split_idx
)
const
{
return
split_feature_
real_
[
split_idx
];
}
inline
int
split_feature
(
int
split_idx
)
const
{
return
split_feature_
[
split_idx
];
}
/*!
* \brief Shrinkage for the tree's output
...
...
@@ -101,8 +100,9 @@ public:
* \param rate The factor of shrinkage
*/
inline
void
Shrinkage
(
double
rate
)
{
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
num_leaves_
;
++
i
)
{
leaf_value_
[
i
]
=
leaf_value_
[
i
]
*
rate
;
leaf_value_
[
i
]
*
=
rate
;
}
}
...
...
@@ -112,15 +112,6 @@ public:
/*! \brief Serialize this object to json*/
std
::
string
ToJSON
();
template
<
typename
T
>
static
bool
CategoricalDecision
(
T
fval
,
T
threshold
)
{
if
(
static_cast
<
int
>
(
fval
)
==
static_cast
<
int
>
(
threshold
))
{
return
true
;
}
else
{
return
false
;
}
}
template
<
typename
T
>
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
if
(
fval
<=
threshold
)
{
...
...
@@ -130,26 +121,13 @@ public:
}
}
static
const
char
*
GetDecisionTypeName
(
int8_t
type
)
{
if
(
type
==
0
)
{
return
"no_greater"
;
}
else
{
return
"is"
;
}
}
private:
static
std
::
vector
<
bool
(
*
)(
unsigned
int
,
unsigned
int
)
>
inner_decision_funs
;
static
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
decision_funs
;
inline
int
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
private:
/*!
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline
int
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
inline
int
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
/*!
* \brief Find leaf index of which record belongs by features
...
...
@@ -171,15 +149,13 @@ private:
/*! \brief A non-leaf node's right child */
std
::
vector
<
int
>
right_child_
;
/*! \brief A non-leaf node's split feature */
std
::
vector
<
int
>
split_feature_
;
std
::
vector
<
int
>
split_feature_
inner
;
/*! \brief A non-leaf node's split feature, the original index */
std
::
vector
<
int
>
split_feature_
real_
;
std
::
vector
<
int
>
split_feature_
;
/*! \brief A non-leaf node's split threshold in bin */
std
::
vector
<
u
nsigned
in
t
>
threshold_in_bin_
;
std
::
vector
<
u
int32_
t
>
threshold_in_bin_
;
/*! \brief A non-leaf node's split threshold in feature value */
std
::
vector
<
double
>
threshold_
;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std
::
vector
<
int8_t
>
decision_type_
;
/*! \brief A non-leaf node's split gain */
std
::
vector
<
double
>
split_gain_
;
// used for leaf node
...
...
@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return
leaf
;
}
inline
int
Tree
::
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
inline
int
Tree
::
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
node
]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]]
(
iterators
[
split_feature_
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
split_feature_
inner
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
...
...
@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
decision_funs
[
decision_type_
[
node
]]
(
feature_values
[
split_feature_
real_
[
node
]],
if
(
NumericalDecision
<
double
>
(
feature_values
[
split_feature_
[
node
]],
threshold_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
...
...
include/LightGBM/utils/array_args.h
View file @
4f77bd28
...
...
@@ -3,6 +3,7 @@
#include <vector>
#include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace
LightGBM
{
...
...
@@ -12,88 +13,136 @@ namespace LightGBM {
template
<
typename
VAL_T
>
class
ArrayArgs
{
public:
inline
static
size_t
ArgMaxMT
(
const
std
::
vector
<
VAL_T
>&
array
)
{
int
num_threads
=
1
;
#pragma omp parallel
#pragma omp master
{
num_threads
=
omp_get_num_threads
();
}
int
step
=
std
::
max
(
1
,
(
static_cast
<
int
>
(
array
.
size
())
+
num_threads
-
1
)
/
num_threads
);
std
::
vector
<
size_t
>
arg_maxs
(
num_threads
,
0
);
#pragma omp parallel for schedule(static,1)
for
(
int
i
=
0
;
i
<
num_threads
;
++
i
)
{
size_t
start
=
step
*
i
;
if
(
start
>=
array
.
size
())
{
continue
;
}
size_t
end
=
std
::
min
(
array
.
size
(),
start
+
step
);
size_t
arg_max
=
start
;
for
(
size_t
j
=
start
+
1
;
j
<
end
;
++
j
)
{
if
(
array
[
j
]
>
array
[
arg_max
])
{
arg_max
=
j
;
}
}
arg_maxs
[
i
]
=
arg_max
;
}
size_t
ret
=
arg_maxs
[
0
];
for
(
int
i
=
1
;
i
<
num_threads
;
++
i
)
{
if
(
array
[
arg_maxs
[
i
]]
>
array
[
ret
])
{
ret
=
arg_maxs
[
i
];
}
}
return
ret
;
}
inline
static
size_t
ArgMax
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
return
0
;
}
size_t
argMax
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
>
array
[
argMax
])
{
argMax
=
i
;
if
(
array
.
size
()
>
100
)
{
return
ArgMaxMT
(
array
);
}
else
{
size_t
arg_max
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
>
array
[
arg_max
])
{
arg_max
=
i
;
}
}
return
arg_max
;
}
return
argMax
;
}
inline
static
size_t
ArgMin
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
return
0
;
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
arg
M
in
=
i
;
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
_m
in
=
i
;
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
inline
static
size_t
ArgMax
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
return
0
;
}
size_t
arg
M
ax
=
0
;
size_t
arg
_m
ax
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
>
array
[
arg
M
ax
])
{
arg
M
ax
=
i
;
if
(
array
[
i
]
>
array
[
arg
_m
ax
])
{
arg
_m
ax
=
i
;
}
}
return
arg
M
ax
;
return
arg
_m
ax
;
}
inline
static
size_t
ArgMin
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
return
0
;
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
arg
M
in
=
i
;
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
_m
in
=
i
;
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
inline
static
size_t
Partition
(
std
::
vector
<
VAL_T
>*
array
,
size_t
start
,
size_t
end
)
{
VAL_T
&
pivot
=
(
*
array
)[
end
-
1
];
size_t
p_idx
=
start
;
for
(
size_t
i
=
start
;
i
<
end
-
1
;
++
i
)
{
if
((
*
array
)[
i
]
>
pivot
)
{
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
i
]);
++
p_idx
;
}
inline
static
void
Partition
(
std
::
vector
<
VAL_T
>*
arr
,
int
start
,
int
end
,
int
*
l
,
int
*
r
)
{
int
i
=
start
-
1
;
int
j
=
end
-
1
;
int
p
=
i
;
int
q
=
j
;
if
(
start
>=
end
)
{
return
;
}
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
end
-
1
]);
return
p_idx
;
std
::
vector
<
VAL_T
>&
ref
=
*
arr
;
VAL_T
v
=
ref
[
end
-
1
];
for
(;;)
{
while
(
ref
[
++
i
]
>
v
);
while
(
v
>
ref
[
--
j
])
{
if
(
j
==
start
)
{
break
;
}
}
if
(
i
>=
j
)
{
break
;
}
std
::
swap
(
ref
[
i
],
ref
[
j
]);
if
(
ref
[
i
]
==
v
)
{
p
++
;
std
::
swap
(
ref
[
p
],
ref
[
i
]);
}
if
(
v
==
ref
[
j
])
{
q
--
;
std
::
swap
(
ref
[
j
],
ref
[
q
]);
}
}
std
::
swap
(
ref
[
i
],
ref
[
end
-
1
]);
j
=
i
-
1
;
i
=
i
+
1
;
for
(
int
k
=
start
;
k
<=
p
;
k
++
,
j
--
)
{
std
::
swap
(
ref
[
k
],
ref
[
j
]);
}
for
(
int
k
=
end
-
2
;
k
>=
q
;
k
--
,
i
++
)
{
std
::
swap
(
ref
[
i
],
ref
[
k
]);
}
*
l
=
j
;
*
r
=
i
;
};
inline
static
size_
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
ay
,
size_
t
start
,
size_
t
end
,
size_
t
k
)
{
if
(
start
=
=
end
-
1
)
{
inline
static
in
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
,
in
t
start
,
in
t
end
,
in
t
k
)
{
if
(
start
>
=
end
-
1
)
{
return
start
;
}
size_t
p_idx
=
Partition
(
array
,
start
,
end
)
;
i
f
(
p_idx
==
k
)
{
return
p_idx
;
}
else
if
(
k
<
p_idx
)
{
return
ArgMaxAtK
(
array
,
start
,
p_idx
,
k
);
}
else
{
return
ArgMaxAtK
(
arr
ay
,
p_idx
+
1
,
end
,
k
);
int
l
=
start
;
i
nt
r
=
end
-
1
;
Partition
(
arr
,
start
,
end
,
&
l
,
&
r
)
;
if
((
k
>
l
&&
k
<
r
)
||
l
==
0
||
r
==
end
-
1
)
{
return
k
;
}
else
if
(
k
<=
l
)
{
return
ArgMaxAtK
(
arr
,
start
,
l
,
k
);
}
else
{
return
ArgMaxAtK
(
arr
,
r
,
end
,
k
);
}
}
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
size_
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
in
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
out
->
clear
();
if
(
k
<=
0
)
{
return
;
...
...
@@ -104,7 +153,7 @@ public:
if
(
k
>=
array
.
size
())
{
return
;
}
ArgMaxAtK
(
out
,
0
,
out
->
size
(),
k
-
1
);
ArgMaxAtK
(
out
,
0
,
static_cast
<
int
>
(
out
->
size
()
)
,
k
-
1
);
out
->
erase
(
out
->
begin
()
+
k
,
out
->
end
());
}
...
...
include/LightGBM/utils/common.h
View file @
4f77bd28
...
...
@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac
=
0
;
scale
=
1.0
;
if
((
*
p
==
'e'
)
||
(
*
p
==
'E'
))
{
u
nsigned
in
t
expon
;
u
int32_
t
expon
;
// Get sign of exponent, if any.
++
p
;
if
(
*
p
==
'-'
)
{
...
...
include/LightGBM/utils/random.h
View file @
4f77bd28
...
...
@@ -20,30 +20,41 @@ public:
std
::
random_device
rd
;
auto
genrator
=
std
::
mt19937
(
rd
());
std
::
uniform_int_distribution
<
int
>
distribution
(
0
,
x
);
x
=
static_cast
<
unsigned
int
>
(
distribution
(
genrator
)
)
;
x
=
distribution
(
genrator
);
}
/*!
* \brief Constructor, with specific seed
*/
Random
(
int
seed
)
{
x
=
static_cast
<
unsigned
int
>
(
seed
)
;
x
=
seed
;
}
/*!
* \brief Generate random integer
* \brief Generate random integer, int16 range. [0, 65536]
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline
int
NextShort
(
int
lower_bound
,
int
upper_bound
)
{
return
(
RandInt16
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
/*!
* \brief Generate random integer, int32 range
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline
int
NextInt
(
int
lower_bound
,
int
upper_bound
)
{
return
(
fastrand
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
return
(
RandInt32
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
/*!
* \brief Generate random float data
* \return The random float between [0.0, 1.0)
*/
inline
float
NextFloat
()
{
// get random float in [0,1)
return
static_cast
<
float
>
(
fastrand
())
/
(
32768.0
f
);
return
static_cast
<
float
>
(
RandInt16
())
/
(
32768.0
f
);
}
/*!
* \brief Sample K data from {0,1,...,N-1}
...
...
@@ -65,10 +76,16 @@ public:
return
ret
;
}
private:
inline
int
fastrand
()
{
inline
int
RandInt16
()
{
x
=
(
214013
*
x
+
2531011
);
return
(
x
>>
16
)
&
0x7FFF
;
}
inline
int
RandInt32
()
{
x
=
(
214013
*
x
+
2531011
);
return
x
&
0x7FFFFFF
;
}
int
x
=
123456789
;
};
...
...
pmml/pmml.py
View file @
4f77bd28
...
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def
print_simple_predicate
(
tab_len
,
node_id
,
is_left_child
,
prev_node_idx
,
is_leaf
):
if
is_left_child
:
op
=
'equal'
if
decision_type
[
prev_node_idx
]
==
1
else
'lessOrEqual'
op
=
'lessOrEqual'
else
:
op
=
'notEqual'
if
decision_type
[
prev_node_idx
]
==
1
else
'greaterThan'
op
=
'greaterThan'
out_
(
'
\t
'
*
(
tab_len
+
1
)
+
(
"<SimplePredicate field=
\"
{0}
\"
"
+
" operator=
\"
{1}
\"
value=
\"
{2}
\"
/>"
).
format
(
get_field_name
(
node_id
,
prev_node_idx
,
is_leaf
),
op
,
get_threshold
(
node_id
,
prev_node_idx
,
is_leaf
)))
...
...
@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature
=
get_array_ints
(
next
(
model_content
))
split_gain
=
next
(
model_content
)
# unused
threshold
=
get_array_strings
(
next
(
model_content
))
decision_type
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
...
...
python-package/lightgbm/basic.py
View file @
4f77bd28
...
...
@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
pandas_categorical
):
def
_data_from_pandas
(
data
,
feature_name
):
if
isinstance
(
data
,
DataFrame
):
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
...
...
@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
warnings
.
filterwarnings
(
'once'
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
data
=
data
.
rename
(
columns
=
str
)
cat_cols
=
data
.
select_dtypes
(
include
=
[
'category'
]).
columns
if
pandas_categorical
is
None
:
# train dataset
pandas_categorical
=
[
list
(
data
[
col
].
cat
.
categories
)
for
col
in
cat_cols
]
else
:
if
len
(
cat_cols
)
!=
len
(
pandas_categorical
):
raise
ValueError
(
'train and valid dataset categorical_feature do not match.'
)
for
col
,
category
in
zip
(
cat_cols
,
pandas_categorical
):
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is pandas Index object
data
=
data
.
copy
()
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
)
if
categorical_feature
is
not
None
:
if
feature_name
is
None
:
feature_name
=
list
(
data
.
columns
)
if
categorical_feature
==
'auto'
:
categorical_feature
=
list
(
cat_cols
)
else
:
categorical_feature
=
list
(
categorical_feature
)
+
list
(
cat_cols
)
if
feature_name
==
'auto'
:
feature_name
=
list
(
data
.
columns
)
data_dtypes
=
data
.
dtypes
...
...
@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
else
:
if
feature_name
==
'auto'
:
feature_name
=
None
if
categorical_feature
==
'auto'
:
categorical_feature
=
None
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
return
data
,
feature_name
def
_label_from_pandas
(
label
):
...
...
@@ -277,19 +256,6 @@ def _label_from_pandas(label):
return
label
def
_save_pandas_categorical
(
file_name
,
pandas_categorical
):
with
open
(
file_name
,
'a'
)
as
f
:
f
.
write
(
'
\n
pandas_categorical:'
+
json
.
dumps
(
pandas_categorical
,
default
=
json_default_with_numpy
))
def
_load_pandas_categorical
(
file_name
):
with
open
(
file_name
,
'r'
)
as
f
:
last_line
=
f
.
readlines
()[
-
1
]
if
last_line
.
startswith
(
'pandas_categorical:'
):
return
json
.
loads
(
last_line
[
len
(
'pandas_categorical:'
):])
return
None
class
_InnerPredictor
(
object
):
"""
A _InnerPredictor of LightGBM.
...
...
@@ -321,7 +287,6 @@ class _InnerPredictor(object):
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
False
self
.
handle
=
booster_handle
...
...
@@ -335,7 +300,6 @@ class _InnerPredictor(object):
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
None
else
:
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
...
...
@@ -371,7 +335,7 @@ class _InnerPredictor(object):
"""
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
data
=
_data_from_pandas
(
data
,
None
)[
0
]
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
...
...
@@ -532,7 +496,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
,
feature_name
=
'auto'
,
params
=
None
,
free_raw_data
=
True
):
"""
Parameters
...
...
@@ -555,11 +519,6 @@ class Dataset(object):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
...
...
@@ -574,12 +533,10 @@ class Dataset(object):
self
.
group
=
group
self
.
silent
=
silent
self
.
feature_name
=
feature_name
self
.
categorical_feature
=
categorical_feature
self
.
params
=
params
self
.
free_raw_data
=
free_raw_data
self
.
used_indices
=
None
self
.
_predictor
=
None
self
.
pandas_categorical
=
None
def
__del__
(
self
):
self
.
_free_handle
()
...
...
@@ -592,11 +549,11 @@ class Dataset(object):
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
):
params
=
None
):
if
data
is
None
:
self
.
handle
=
None
return
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
)
data
,
feature_name
,
=
_data_from_pandas
(
data
,
feature_name
)
label
=
_label_from_pandas
(
label
)
self
.
data_has_header
=
False
"""process for args"""
...
...
@@ -608,23 +565,6 @@ class Dataset(object):
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
"""get categorical features"""
if
categorical_feature
is
not
None
:
categorical_indices
=
set
()
feature_dict
=
{}
if
feature_name
is
not
None
:
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
for
name
in
categorical_feature
:
if
isinstance
(
name
,
string_type
)
and
name
in
feature_dict
:
categorical_indices
.
add
(
feature_dict
[
name
])
elif
isinstance
(
name
,
integer_types
):
categorical_indices
.
add
(
name
)
else
:
raise
TypeError
(
"Wrong type({}) or unknown name({}) in categorical_feature"
.
format
(
type
(
name
).
__name__
,
name
))
params
[
'categorical_column'
]
=
sorted
(
categorical_indices
)
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
ref_dataset
=
None
...
...
@@ -784,7 +724,7 @@ class Dataset(object):
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
categorical_feature
=
self
.
categorical_feature
,
params
=
self
.
params
)
params
=
self
.
params
)
if
self
.
free_raw_data
:
self
.
data
=
None
return
self
...
...
@@ -814,7 +754,6 @@ class Dataset(object):
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
return
ret
def
subset
(
self
,
used_indices
,
params
=
None
):
...
...
@@ -829,9 +768,8 @@ class Dataset(object):
Other parameters
"""
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
params
=
params
)
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
ret
.
used_indices
=
used_indices
return
ret
...
...
@@ -939,24 +877,6 @@ class Dataset(object):
else
:
raise
TypeError
(
"Unknown type"
)
def
set_categorical_feature
(
self
,
categorical_feature
):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if
self
.
categorical_feature
==
categorical_feature
:
return
if
self
.
data
is
not
None
:
self
.
categorical_feature
=
categorical_feature
self
.
_free_handle
()
else
:
raise
LightGBMError
(
"Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this."
)
def
_set_predictor
(
self
,
predictor
):
"""
Set predictor for continued training, not recommand for user to call this function.
...
...
@@ -979,7 +899,6 @@ class Dataset(object):
reference : Dataset
Will use reference as template to consturct current dataset
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
_set_predictor
(
reference
.
_predictor
)
if
self
.
reference
is
reference
:
...
...
@@ -1208,7 +1127,6 @@ class Booster(object):
self
.
__inner_predict_buffer
=
[
None
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__get_eval_info
()
self
.
pandas_categorical
=
train_set
.
pandas_categorical
elif
model_file
is
not
None
:
"""Prediction task"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
...
...
@@ -1221,7 +1139,6 @@ class Booster(object):
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
'model_str'
in
params
:
self
.
__load_model_from_string
(
params
[
'model_str'
])
else
:
...
...
@@ -1237,7 +1154,6 @@ class Booster(object):
def
__deepcopy__
(
self
,
_
):
model_str
=
self
.
__save_model_to_string
()
booster
=
Booster
({
'model_str'
:
model_str
})
booster
.
pandas_categorical
=
self
.
pandas_categorical
return
booster
def
__getstate__
(
self
):
...
...
@@ -1477,7 +1393,6 @@ class Booster(object):
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
c_str
(
filename
)))
_save_pandas_categorical
(
filename
,
self
.
pandas_categorical
)
def
__load_model_from_string
(
self
,
model_str
):
"""[Private] Load model from string"""
...
...
@@ -1589,7 +1504,6 @@ class Booster(object):
def
_to_predictor
(
self
):
"""Convert to predictor"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
.
pandas_categorical
=
self
.
pandas_categorical
return
predictor
def
feature_name
(
self
):
...
...
python-package/lightgbm/engine.py
View file @
4f77bd28
...
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""
...
...
@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
...
...
@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100,
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
is_valid_contain_train
=
False
train_data_name
=
"training"
...
...
@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results):
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
callbacks
=
None
):
...
...
@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
...
...
@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10,
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
if
metrics
:
params
.
setdefault
(
'metric'
,
[])
...
...
python-package/lightgbm/plotting.py
View file @
4f77bd28
...
...
@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
graph
.
node
(
name
,
label
=
label
)
if
root
[
'decision_type'
]
==
'no_greater'
:
l_dec
,
r_dec
=
'<='
,
'>'
elif
root
[
'decision_type'
]
==
'is'
:
l_dec
,
r_dec
=
'is'
,
"isn't"
else
:
raise
ValueError
(
'Invalid decision type in tree model.'
)
l_dec
,
r_dec
=
'<='
,
'>'
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
else
:
# leaf
...
...
python-package/lightgbm/sklearn.py
View file @
4f77bd28
...
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
"""
Fit the gradient boosting model
...
...
@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
...
...
@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds
=
early_stopping_rounds
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
if
evals_result
:
...
...
@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score
=
None
,
eval_metric
=
"l2"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
feature_name
=
'auto'
,
callbacks
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
init_score
=
init_score
,
eval_set
=
eval_set
,
...
...
@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
...
...
@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score
=
None
,
eval_metric
=
"binary_logloss"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
y
=
self
.
_le
.
transform
(
y
)
...
...
@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
...
...
@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
"""
Most arguments like common methods except following:
...
...
@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
src/boosting/boosting.cpp
View file @
4f77bd28
#include <LightGBM/boosting.h>
#include "gbdt.h"
#include "dart.hpp"
#include "goss.hpp"
namespace
LightGBM
{
...
...
@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return
new
GBDT
();
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
return
new
DART
();
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
return
new
GOSS
();
}
else
{
return
nullptr
;
}
...
...
@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret
.
reset
(
new
GBDT
());
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
ret
.
reset
(
new
DART
());
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
ret
.
reset
(
new
GOSS
());
}
else
{
Log
::
Fatal
(
"unknow boosting type %s"
,
type
.
c_str
());
}
LoadFileToBoosting
(
ret
.
get
(),
filename
);
}
else
{
...
...
src/boosting/dart.hpp
View file @
4f77bd28
...
...
@@ -38,6 +38,11 @@ public:
random_for_drop_
=
Random
(
gbdt_config_
->
drop_seed
);
sum_weight_
=
0.0
f
;
}
/*!
* \brief Reset training data, config, objective function and training metrics.
*        This override performs no extra work of its own and simply forwards
*        every argument to GBDT::ResetTrainingData.
*/
void ResetTrainingData(const BoostingConfig* config,
                       const Dataset* train_data,
                       const ObjectiveFunction* object_function,
                       const std::vector<const Metric*>& training_metrics) override {
  // Delegate unchanged to the base implementation.
  GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
}
/*!
* \brief one training iteration
*/
...
...
src/boosting/gbdt.cpp
View file @
4f77bd28
...
...
@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
...
...
@@ -37,7 +36,6 @@ GBDT::GBDT()
}
GBDT
::~
GBDT
()
{
}
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
...
...
@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
label_idx_
=
train_data
->
label_idx
();
// get feature names
feature_names_
=
train_data
->
feature_names
();
// get feature infos
feature_infos_
.
clear
();
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
++
i
)
{
int
feature_idx
=
train_data
->
GetInnerFeatureIndex
(
i
);
if
(
feature_idx
<
0
)
{
feature_infos_
.
push_back
(
"trival feature"
);
}
else
{
feature_infos_
.
push_back
(
train_data
->
FeatureAt
(
feature_idx
)
->
bin_mapper
()
->
bin_info
());
}
}
}
if
((
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
...
...
@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const {
ss
<<
pairs
[
i
].
second
<<
"="
<<
std
::
to_string
(
pairs
[
i
].
first
)
<<
std
::
endl
;
}
ss
<<
std
::
endl
<<
"feature information:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
++
i
)
{
ss
<<
feature_names_
[
i
]
<<
"="
<<
feature_infos_
[
i
]
<<
std
::
endl
;
}
return
ss
.
str
();
}
...
...
@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
Log
::
Fatal
(
"Wrong size of feature_names"
);
return
false
;
}
}
else
{
}
else
{
Log
::
Fatal
(
"Model file doesn't contain feature names"
);
return
false
;
}
// returns offset, or lines.size() if not found.
auto
find_string_lineno
=
[
&
lines
](
const
std
::
string
&
str
,
size_t
start_line
)
{
size_t
i
=
start_line
;
size_t
featinfo_find_pos
=
std
::
string
::
npos
;
while
(
i
<
lines
.
size
())
{
featinfo_find_pos
=
lines
[
i
].
find
(
str
);
if
(
featinfo_find_pos
!=
std
::
string
::
npos
)
break
;
++
i
;
}
return
i
;
};
// load feature information
{
size_t
finfo_line_idx
=
find_string_lineno
(
"feature information:"
,
0
);
if
(
finfo_line_idx
>=
lines
.
size
())
{
Log
::
Fatal
(
"Model file doesn't contain feature information"
);
return
false
;
}
feature_infos_
.
resize
(
max_feature_idx_
+
1
);
// search for each feature name
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
i
++
)
{
const
auto
feat_name
=
feature_names_
[
i
];
size_t
line_idx
=
find_string_lineno
(
feat_name
+
"="
,
finfo_line_idx
+
1
);
if
(
line_idx
>=
lines
.
size
())
{
Log
::
Fatal
((
"Model file doesn't contain feature information for feature "
+
feat_name
).
c_str
());
return
false
;
}
const
auto
this_line
=
lines
[
line_idx
];
feature_infos_
[
i
]
=
this_line
.
substr
((
feat_name
+
"="
).
size
());
}
}
// get tree models
size_t
i
=
0
;
while
(
i
<
lines
.
size
())
{
...
...
@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std
::
vector
<
size_t
>
feature_importances
(
max_feature_idx_
+
1
,
0
);
for
(
size_t
iter
=
0
;
iter
<
models_
.
size
();
++
iter
)
{
for
(
int
split_idx
=
0
;
split_idx
<
models_
[
iter
]
->
num_leaves
()
-
1
;
++
split_idx
)
{
++
feature_importances
[
models_
[
iter
]
->
split_feature
_real
(
split_idx
)];
++
feature_importances
[
models_
[
iter
]
->
split_feature
(
split_idx
)];
}
}
// store the importance first
...
...
src/boosting/gbdt.h
View file @
4f77bd28
...
...
@@ -329,8 +329,6 @@ protected:
int
num_init_iteration_
;
/*! \brief Feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Feature informations */
std
::
vector
<
std
::
string
>
feature_infos_
;
/*! \brief number of threads */
int
num_threads_
;
/*! \brief Buffer for multi-threading bagging */
...
...
src/boosting/goss.hpp
0 → 100644
View file @
4f77bd28
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace
LightGBM
{
/*!
* \brief Gradient-based one-side sampling on top of GBDT.
*        In each sampled iteration, records with the largest |gradient * hessian|
*        are always kept; the remaining records are randomly sub-sampled and their
*        gradients/hessians are rescaled (see BaggingHelper).
*/
class GOSS: public GBDT {
public:
  /*!
  * \brief Constructor
  */
  GOSS() : GBDT() {
  }

  ~GOSS() {
  }

  /*!
  * \brief Initialize via GBDT::Init, then validate the GOSS specific settings.
  *        Requires top_rate + other_rate <= 1, both rates > 0, and that ordinary
  *        bagging (bagging_freq > 0 with bagging_fraction != 1) is not enabled.
  */
  void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
            const std::vector<const Metric*>& training_metrics) override {
    GBDT::Init(config, train_data, object_function, training_metrics);
    CHECK(gbdt_config_->top_rate + gbdt_config_->other_rate <= 1.0f);
    CHECK(gbdt_config_->top_rate > 0.0f && gbdt_config_->other_rate > 0.0f);
    if (gbdt_config_->bagging_freq > 0 && gbdt_config_->bagging_fraction != 1.0f) {
      Log::Fatal("cannot used bagging in GOSS");
    }
    Log::Info("using GOSS");
  }

  /*!
  * \brief Reset training data and (re)allocate all sampling buffers.
  *        Buffers indexed by record are sized to num_data_, buffers indexed by
  *        thread are sized to num_threads_. When top_rate + other_rate <= 0.5 a
  *        separate subset Dataset is prepared and used instead of index bagging.
  */
  void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
                         const std::vector<const Metric*>& training_metrics) override {
    if (config->bagging_freq > 0 && config->bagging_fraction != 1.0f) {
      Log::Fatal("cannot used bagging in GOSS");
    }
    GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
    if (train_data_ == nullptr) { return; }

    // per-record buffers
    bag_data_indices_.resize(num_data_);
    tmp_indices_.resize(num_data_);
    tmp_indice_right_.resize(num_data_);
    // per-thread buffers
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);

    is_use_subset_ = false;
    if (config->top_rate + config->other_rate <= 0.5) {
      auto bag_data_cnt = static_cast<data_size_t>((config->top_rate + config->other_rate) * num_data_);
      tmp_subset_.reset(new Dataset(bag_data_cnt));
      tmp_subset_->CopyFeatureMapperFrom(train_data_);
      is_use_subset_ = true;
    }
    // start with "no sampling": every record is in the bag
    bag_data_cnt_ = num_data_;
  }

  /*!
  * \brief Sample one contiguous chunk [start, start + cnt) of records.
  *        Records whose |gradient * hessian| is >= the top_k-th largest value of
  *        the chunk are always selected. Every other record is selected with
  *        probability rest_need / rest_all, and when selected its gradient and
  *        hessian are multiplied in place by (cnt - top_k) / other_k.
  * \param cur_rand Random generator used for the sampling decisions
  * \param start Index of the first record of the chunk
  * \param cnt Number of records in the chunk
  * \param buffer Output: indices of selected records
  * \param buffer_right Output: indices of records that were not selected
  * \return Number of selected records written to buffer
  */
  data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
    // An empty chunk has nothing to select. Without this guard top_k is forced
    // to 1 below and tmp_gradients[0] would be read from an empty vector.
    if (cnt <= 0) {
      return 0;
    }
    std::vector<score_t> tmp_gradients(cnt);
    for (data_size_t i = 0; i < cnt; ++i) {
      tmp_gradients[i] = std::fabs(gradients_[start + i] * hessians_[start + i]);
    }
    data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
    data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
    top_k = std::max(1, top_k);
    // NOTE(review): presumably rearranges tmp_gradients so that position
    // top_k - 1 holds the top_k-th largest value - confirm against ArrayArgs.
    ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
    score_t threshold = tmp_gradients[top_k - 1];

    // weight applied to the randomly sampled (small gradient) records
    score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
    data_size_t cur_left_cnt = 0;
    data_size_t cur_right_cnt = 0;
    data_size_t big_weight_cnt = 0;
    for (data_size_t i = 0; i < cnt; ++i) {
      if (std::fabs(gradients_[start + i] * hessians_[start + i]) >= threshold) {
        // large gradient: always keep
        buffer[cur_left_cnt++] = start + i;
        ++big_weight_cnt;
      } else {
        // small gradient: keep with probability (still needed) / (still available)
        data_size_t sampled = cur_left_cnt - big_weight_cnt;
        data_size_t rest_need = other_k - sampled;
        data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
        double prob = (rest_need) / static_cast<double>(rest_all);
        if (cur_rand.NextFloat() < prob) {
          buffer[cur_left_cnt++] = start + i;
          gradients_[start + i] *= multiply;
          hessians_[start + i] *= multiply;
        } else {
          buffer_right[cur_right_cnt++] = start + i;
        }
      }
    }
    return cur_left_cnt;
  }

  /*!
  * \brief Run the sampling for one iteration.
  *        The first static_cast<int>(1 / learning_rate) iterations use all data.
  *        Afterwards the records are split into one chunk per thread (at least
  *        1000 records each), every chunk is sampled by BaggingHelper, and the
  *        per-chunk results are gathered into bag_data_indices_: selected indices
  *        first, followed by the non-selected ones.
  * \param iter Current iteration
  */
  void Bagging(int iter) override {
    bag_data_cnt_ = num_data_;
    // not subsample for first iterations
    if (iter < static_cast<int>(1.0f / gbdt_config_->learning_rate)) { return; }

    const data_size_t min_inner_size = 1000;
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

    #pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      // Skip chunks that start at or past the end of the data; a chunk that
      // starts exactly at num_data_ is empty and must not be sampled.
      if (cur_start >= num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
                                                 tmp_indices_.data() + cur_start,
                                                 tmp_indice_right_.data() + cur_start);
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }

    // prefix sums give every chunk its write position in the gathered output
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

    #pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
                    tmp_indices_.data() + offsets_buf_[i],
                    left_cnts_buf_[i] * sizeof(data_size_t));
      }
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
                    tmp_indice_right_.data() + offsets_buf_[i],
                    right_cnts_buf_[i] * sizeof(data_size_t));
      }
    }
    bag_data_cnt_ = left_cnt;
    // set bagging data to tree learner
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
      tree_learner_->ResetTrainingData(tmp_subset_.get());
    }
  }

  /*!
  * \brief Get Type name of this boosting object
  */
  const char* SubModelName() const override { return "tree"; }

private:
  /*! \brief Per-chunk scratch buffer for indices of records that were not selected */
  std::vector<data_size_t> tmp_indice_right_;
};
}
// namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
src/boosting/score_updater.hpp
View file @
4f77bd28
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
...
...
src/c_api.cpp
View file @
4f77bd28
...
...
@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol
);
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
if
(
std
::
fabs
(
row
[
j
])
>
1e-15
)
{
sample_values
[
j
].
push_back
(
row
[
j
]);
if
(
std
::
fabs
(
row
[
j
])
>
kEpsilon
)
{
sample_values
[
j
].
emplace_back
(
row
[
j
]);
sample_idx
[
j
].
emplace_back
(
static_cast
<
int
>
(
i
));
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
...
...
@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
;
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
();
}
sample_values
.
resize
(
inner_data
.
first
+
1
);
sample_idx
.
resize
(
inner_data
.
first
+
1
);
}
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
if
(
std
::
fabs
(
inner_data
.
second
)
>
kEpsilon
)
{
// edit the feature value
sample_values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
sample_values
[
inner_data
.
first
].
emplace_back
(
inner_data
.
second
);
sample_idx
[
inner_data
.
first
].
emplace_back
(
static_cast
<
int
>
(
i
));
}
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
...
...
@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol_ptr
-
1
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol_ptr
-
1
);
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
for
(
int
j
=
0
;
j
<
sample_cnt
;
j
++
)
{
auto
val
=
col_it
.
Get
(
sample_indices
[
j
]);
if
(
std
::
fabs
(
val
)
>
kEpsilon
)
{
sample_values
[
i
].
push_back
(
val
);
sample_values
[
i
].
emplace_back
(
val
);
sample_idx
[
i
].
emplace_back
(
j
);
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
int
feature_idx
=
ret
->
Get
InnerFeatureIndex
(
i
);
int
feature_idx
=
ret
->
InnerFeatureIndex
(
i
);
if
(
feature_idx
<
0
)
{
continue
;
}
int
group
=
ret
->
Feature2Group
(
feature_idx
);
int
sub_feature
=
ret
->
Feture2SubFeature
(
feature_idx
);
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
int
row_idx
=
0
;
while
(
row_idx
<
nrow
)
{
...
...
@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx
=
pair
.
first
;
// no more data
if
(
row_idx
<
0
)
{
break
;
}
ret
->
FeatureAt
(
feature_idx
)
->
PushData
(
tid
,
row_idx
,
pair
.
second
);
ret
->
Push
One
Data
(
tid
,
row_idx
,
group
,
sub_feature
,
pair
.
second
);
}
}
ret
->
FinishLoad
();
...
...
src/io/bin.cpp
View file @
4f77bd28
...
...
@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_
=
other
.
num_bin_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_type_
=
other
.
bin_type_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
bin_upper_bound_
=
other
.
bin_upper_bound_
;
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
}
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
...
...
@@ -43,37 +37,48 @@ BinMapper::~BinMapper() {
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
,
BinType
bin_type
)
{
bin_type_
=
bin_type
;
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
size_t
sample_size
=
total_sample_cnt
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
ref_values
.
size
());
/*!
* \brief Check whether a feature should be filtered out given its per-bin counts.
*        Walks every boundary between adjacent bins while accumulating the number
*        of samples on the left. Returns false as soon as the left side, or the
*        right side (total_cnt - left), holds at least filter_cnt samples.
*        Returns true when no boundary satisfies that, which includes the cases
*        of zero or one bin (no boundary exists).
*        NOTE(review): the test passes when EITHER side is large enough; confirm
*        whether both sides were meant to be required.
* \param cnt_in_bin Number of samples in each bin
* \param total_cnt Total number of samples
* \param filter_cnt Minimal number of samples required on a side
* \return true if the feature needs to be filtered
*/
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
  int sum_left = 0;
  // "i + 1 < size()" instead of "i < size() - 1": size() is unsigned, so the
  // subtraction wraps around for an empty vector and the loop would read out of bounds.
  for (size_t i = 0; i + 1 < cnt_in_bin.size(); ++i) {
    sum_left += cnt_in_bin[i];
    if (sum_left >= filter_cnt || total_cnt - sum_left >= filter_cnt) {
      return false;
    }
  }
  return true;
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
)
{
// limit max_bin by min_data_in_bin
std
::
vector
<
double
>&
raw_values
=
values
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
raw_values
.
size
());
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
r
ef
_values
.
begin
(),
r
ef
_values
.
end
());
std
::
sort
(
r
aw
_values
.
begin
(),
r
aw
_values
.
end
());
// push zero in the front
if
(
r
ef
_values
.
empty
()
||
(
r
ef
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0
);
if
(
r
aw
_values
.
empty
()
||
(
r
aw
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
}
if
(
!
r
ef
_values
.
empty
())
{
distinct_values
.
push_back
(
r
ef
_values
[
0
]);
if
(
!
r
aw
_values
.
empty
())
{
distinct_values
.
push_back
(
r
aw
_values
[
0
]);
counts
.
push_back
(
1
);
}
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
ref_values
[
i
-
1
]
==
0.0
f
)
{
counts
.
back
()
+=
zero_cnt
;
}
else
if
(
ref_values
[
i
-
1
]
<
0.0
f
&&
ref_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0
);
for
(
size_t
i
=
1
;
i
<
raw_values
.
size
();
++
i
)
{
if
(
raw_values
[
i
]
!=
raw_values
[
i
-
1
])
{
if
(
raw_values
[
i
-
1
]
<
0.0
f
&&
raw_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
zero_cnt
);
}
distinct_values
.
push_back
(
r
ef
_values
[
i
]);
distinct_values
.
push_back
(
r
aw
_values
[
i
]);
counts
.
push_back
(
1
);
}
else
{
++
counts
.
back
();
...
...
@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
// push zero in the back
if
(
!
r
ef
_values
.
empty
()
&&
r
ef
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
);
if
(
!
r
aw
_values
.
empty
()
&&
r
aw
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
}
min_val_
=
distinct_values
.
front
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
num_values
<=
max_bin
)
{
std
::
sort
(
distinct_values
.
begin
(),
distinct_values
.
end
());
// use distinct value is enough
num_bin_
=
num_values
;
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_values
);
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
;
}
cnt_in_bin
=
counts
;
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
else
{
// mean size for one bin
double
mean_bin_size
=
sample_size
/
static_cast
<
double
>
(
max_bin
);
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
sample_size
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
if
(
num_values
<=
max_bin
)
{
// use distinct value is enough
bin_upper_bound_
.
clear
();
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cur_cnt_inbin
=
0
;
}
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
// convert to int type first
std
::
vector
<
int
>
distinct_values_int
;
std
::
vector
<
int
>
counts_int
;
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]));
counts_int
.
push_back
(
counts
[
0
]);
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
counts_int
.
push_back
(
counts
[
i
]);
}
else
{
counts_int
.
back
()
+=
counts
[
i
];
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
if
(
zero_cnt
>
mean_bin_size
)
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
}
// mean size for one bin
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
// sort by counts
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
// will ingore the categorical of small counts
const
int
cut_cnt
=
static_cast
<
int
>
(
sample_size
*
0.95
f
);
categorical_2_bin_
.
clear
();
bin_2_categorical_
.
clear
();
num_bin_
=
0
;
int
used_cnt
=
0
;
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
cnt_in_bin
=
counts_int
;
cnt_in_bin
[
0
]
+
=
st
atic_cast
<
int
>
(
sample_size
)
-
used_cnt
;
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
st
d
::
numeric_limits
<
double
>::
infinity
()
;
}
// check trival(num_bin_ == 1) feature
if
(
num_bin_
<=
1
)
{
is_trival_
=
true
;
default_bin_
=
0
;
}
else
{
is_trival_
=
false
;
default_bin_
=
ValueToBin
(
0
);
}
if
(
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
))
{
is_trival_
=
true
;
}
// calculate sparse rate
CHECK
(
num_bin_
<=
max_bin
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
sample_
size
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
total_
sample_
cnt
);
}
...
...
@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size
+=
sizeof
(
int
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinTyp
e
);
size
+=
2
*
sizeof
(
doubl
e
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
return
size
;
}
...
...
@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
}
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
...
...
@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
else
{
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
));
categorical_2_bin_
.
clear
();
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
}
}
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
}
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
ret
+=
sizeof
(
int
)
*
num_bin_
;
}
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
ret
+=
sizeof
(
double
)
*
num_bin_
;
return
ret
;
}
...
...
@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>;
template
class
DenseBin
<
uint16_t
>;
template
class
DenseBin
<
uint32_t
>;
template
class
DenseCategoricalBin
<
uint8_t
>;
template
class
DenseCategoricalBin
<
uint16_t
>;
template
class
DenseCategoricalBin
<
uint32_t
>;
template
class
SparseBin
<
uint8_t
>;
template
class
SparseBin
<
uint16_t
>;
template
class
SparseBin
<
uint32_t
>;
template
class
SparseCategoricalBin
<
uint8_t
>;
template
class
SparseCategoricalBin
<
uint16_t
>;
template
class
SparseCategoricalBin
<
uint32_t
>;
template
class
OrderedSparseBin
<
uint8_t
>;
template
class
OrderedSparseBin
<
uint16_t
>;
template
class
OrderedSparseBin
<
uint32_t
>;
double
BinMapper
::
kSparseThreshold
=
0.8
f
;
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
bool
*
is_sparse
,
uint32_t
default_bin
,
BinType
bin_type
)
{
bool
is_enable_sparse
,
bool
*
is_sparse
)
{
// sparse threshold
const
double
kSparseThreshold
=
0.8
f
;
if
(
sparse_rate
>=
kSparseThreshold
&&
is_enable_sparse
)
{
if
(
sparse_rate
>=
BinMapper
::
kSparseThreshold
&&
is_enable_sparse
)
{
*
is_sparse
=
true
;
return
CreateSparseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateSparseBin
(
num_data
,
num_bin
);
}
else
{
*
is_sparse
=
false
;
return
CreateDenseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateDenseBin
(
num_data
,
num_bin
);
}
}
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
uint32_t
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
255
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
DenseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
DenseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
num_bin
<=
256
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
DenseBin
<
uint16_t
>
(
num_data
);
}
else
{
if
(
num_bin
<=
255
)
{
return
new
DenseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
DenseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
DenseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
return
new
DenseBin
<
uint32_t
>
(
num_data
);
}
}
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
uint32_t
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
255
)
{
return
new
SparseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
SparseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
SparseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
num_bin
<=
256
)
{
return
new
SparseBin
<
uint8_t
>
(
num_data
);
}
else
if
(
num_bin
<=
65536
)
{
return
new
SparseBin
<
uint16_t
>
(
num_data
);
}
else
{
if
(
num_bin
<=
255
)
{
return
new
SparseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
SparseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
SparseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
return
new
SparseBin
<
uint32_t
>
(
num_data
);
}
}
...
...
src/io/config.cpp
View file @
4f77bd28
...
...
@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed.
if
(
GetInt
(
params
,
"seed"
,
&
seed
))
{
Random
rand
(
seed
);
int
int_max
=
std
::
numeric_limits
<
in
t
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
int
int_max
=
std
::
numeric_limits
<
shor
t
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
}
GetTaskType
(
params
);
GetBoostingType
(
params
);
...
...
@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type
=
"gbdt"
;
}
else
if
(
value
==
std
::
string
(
"dart"
))
{
boosting_type
=
"dart"
;
}
else
if
(
value
==
std
::
string
(
"goss"
))
{
boosting_type
=
"goss"
;
}
else
{
Log
::
Fatal
(
"Unknown boosting type %s"
,
value
.
c_str
());
}
...
...
@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"categorical_column"
,
&
categorical_column
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
GetBool
(
params
,
"enable_bundle"
,
&
enable_bundle
);
GetBool
(
params
,
"adjacent_bundle"
,
&
adjacent_bundle
);
}
...
...
@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt
(
params
,
"max_drop"
,
&
max_drop
);
GetBool
(
params
,
"xgboost_dart_mode"
,
&
xgboost_dart_mode
);
GetBool
(
params
,
"uniform_drop"
,
&
uniform_drop
);
GetDouble
(
params
,
"top_rate"
,
&
top_rate
);
GetDouble
(
params
,
"other_rate"
,
&
other_rate
);
CHECK
(
drop_rate
<=
1.0
&&
drop_rate
>=
0.0
);
CHECK
(
skip_drop
<=
1.0
&&
skip_drop
>=
0.0
);
GetTreeLearnerType
(
params
);
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment