Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
4f77bd28
Commit
4f77bd28
authored
Feb 20, 2017
by
Guolin Ke
Browse files
update to v2.
parent
13d4581b
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
726 additions
and
506 deletions
+726
-506
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+3
-3
include/LightGBM/feature_group.h
include/LightGBM/feature_group.h
+190
-0
include/LightGBM/tree.h
include/LightGBM/tree.h
+37
-46
include/LightGBM/utils/array_args.h
include/LightGBM/utils/array_args.h
+89
-40
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+1
-1
include/LightGBM/utils/random.h
include/LightGBM/utils/random.h
+23
-6
pmml/pmml.py
pmml/pmml.py
+2
-3
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+8
-94
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+2
-14
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+1
-6
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+4
-13
src/boosting/boosting.cpp
src/boosting/boosting.cpp
+7
-0
src/boosting/dart.hpp
src/boosting/dart.hpp
+5
-0
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+3
-59
src/boosting/gbdt.h
src/boosting/gbdt.h
+0
-2
src/boosting/goss.hpp
src/boosting/goss.hpp
+174
-0
src/boosting/score_updater.hpp
src/boosting/score_updater.hpp
+2
-0
src/c_api.cpp
src/c_api.cpp
+23
-18
src/io/bin.cpp
src/io/bin.cpp
+138
-195
src/io/config.cpp
src/io/config.cpp
+14
-6
No files found.
include/LightGBM/dataset_loader.h
View file @
4f77bd28
...
@@ -20,7 +20,9 @@ public:
...
@@ -20,7 +20,9 @@ public:
LIGHTGBM_EXPORT
Dataset
*
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
LIGHTGBM_EXPORT
Dataset
*
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
LIGHTGBM_EXPORT
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
);
LIGHTGBM_EXPORT
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
size_t
total_sample_size
,
data_size_t
num_data
);
/*! \brief Disable copy */
/*! \brief Disable copy */
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
...
@@ -69,8 +71,6 @@ private:
...
@@ -69,8 +71,6 @@ private:
std
::
unordered_set
<
int
>
ignore_features_
;
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Mapper from real feature index to used index*/
std
::
unordered_set
<
int
>
categorical_features_
;
};
};
...
...
include/LightGBM/feature.h
→
include/LightGBM/feature
_group
.h
View file @
4f77bd28
#ifndef LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_FEATURE_
GROUP_
H_
#define LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_
GROUP_
H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/random.h>
...
@@ -12,22 +12,41 @@
...
@@ -12,22 +12,41 @@
namespace
LightGBM
{
namespace
LightGBM
{
/*! \brief Using to store data and providing some operations on one feature*/
class
Dataset
;
class
Feature
{
class
DatasetLoader
;
/*! \brief Using to store data and providing some operations on one feature group*/
class
FeatureGroup
{
public:
public:
friend
Dataset
;
friend
DatasetLoader
;
/*!
/*!
* \brief Constructor
* \brief Constructor
* \param feature
_idx Index of this feature
* \param
num_
feature
number of features of this group
* \param bin_mapper Bin mapper for
this
feature
* \param bin_mapper
s
Bin mapper for feature
s
* \param num_data Total number of data
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
* \param is_enable_sparse True if enable sparse feature
*/
*/
Feature
(
int
feature_idx
,
BinMapper
*
bin_mapper
,
FeatureGroup
(
int
num_feature
,
data_size_t
num_data
,
bool
is_enable_sparse
)
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
:
bin_mapper_
(
bin_mapper
)
{
data_size_t
num_data
,
bool
is_enable_sparse
)
:
num_feature_
(
num_feature
)
{
feature_index_
=
feature_idx
;
CHECK
(
static_cast
<
int
>
(
bin_mappers
.
size
())
==
num_feature
);
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
bin_mapper_
->
num_bin
(),
// use bin at zero to store default_bin
bin_mapper_
->
sparse_rate
(),
is_enable_sparse
,
&
is_sparse_
,
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()));
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
int
cnt_non_zero
=
0
;
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
bin_mappers
[
i
].
release
());
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
cnt_non_zero
+=
static_cast
<
int
>
(
num_data
*
(
1.0
f
-
bin_mappers_
[
i
]
->
sparse_rate
()));
}
double
sparse_rate
=
1.0
f
-
static_cast
<
double
>
(
cnt_non_zero
)
/
(
num_data
);
bin_data_
.
reset
(
Bin
::
CreateBin
(
num_data
,
num_total_bin_
,
sparse_rate
,
is_enable_sparse
,
&
is_sparse_
));
}
}
/*!
/*!
* \brief Constructor from memory
* \brief Constructor from memory
...
@@ -35,39 +54,44 @@ public:
...
@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
* \param local_used_indices Local used indices, empty means using all data
*/
*/
Feature
(
const
void
*
memory
,
data_size_t
num_all_data
,
Feature
Group
(
const
void
*
memory
,
data_size_t
num_all_data
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
{
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
{
const
char
*
memory_ptr
=
reinterpret_cast
<
const
char
*>
(
memory
);
const
char
*
memory_ptr
=
reinterpret_cast
<
const
char
*>
(
memory
);
// get featuer index
feature_index_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
feature_index_
);
// get is_sparse
// get is_sparse
is_sparse_
=
*
(
reinterpret_cast
<
const
bool
*>
(
memory_ptr
));
is_sparse_
=
*
(
reinterpret_cast
<
const
bool
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
is_sparse_
);
memory_ptr
+=
sizeof
(
is_sparse_
);
num_feature_
=
*
(
reinterpret_cast
<
const
int
*>
(
memory_ptr
));
memory_ptr
+=
sizeof
(
num_feature_
);
// get bin mapper
// get bin mapper
bin_mapper_
.
reset
(
new
BinMapper
(
memory_ptr
));
bin_mappers_
.
clear
();
memory_ptr
+=
bin_mapper_
->
SizesInByte
();
bin_offsets_
.
clear
();
// start from 1, due to need to store zero bin in this slot
num_total_bin_
=
1
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
.
emplace_back
(
new
BinMapper
(
memory_ptr
));
auto
num_bin
=
bin_mappers_
[
i
]
->
num_bin
();
if
(
bin_mappers_
[
i
]
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
}
num_total_bin_
+=
num_bin
;
bin_offsets_
.
emplace_back
(
num_total_bin_
);
memory_ptr
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
data_size_t
num_data
=
num_all_data
;
data_size_t
num_data
=
num_all_data
;
if
(
!
local_used_indices
.
empty
())
{
if
(
!
local_used_indices
.
empty
())
{
num_data
=
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
num_data
=
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
}
}
if
(
is_sparse_
)
{
if
(
is_sparse_
)
{
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateSparseBin
(
num_data
,
num_total_bin_
));
}
else
{
}
else
{
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
bin_mapper_
->
num_bin
(),
bin_mapper_
->
GetDefaultBin
(),
bin_mapper_
->
bin_type
()
));
bin_data_
.
reset
(
Bin
::
CreateDenseBin
(
num_data
,
num_total_bin_
));
}
}
// get bin data
// get bin data
bin_data_
->
LoadFromMemory
(
memory_ptr
,
local_used_indices
);
bin_data_
->
LoadFromMemory
(
memory_ptr
,
local_used_indices
);
}
}
/*! \brief Destructor */
/*! \brief Destructor */
~
Feature
()
{
~
FeatureGroup
()
{
}
bool
CheckAlign
(
const
Feature
&
other
)
const
{
if
(
feature_index_
!=
other
.
feature_index_
)
{
return
false
;
}
return
bin_mapper_
->
CheckAlign
(
*
(
other
.
bin_mapper_
.
get
()));
}
}
/*!
/*!
...
@@ -76,78 +100,91 @@ public:
...
@@ -76,78 +100,91 @@ public:
* \param idx Index of record
* \param idx Index of record
* \param value feature value of record
* \param value feature value of record
*/
*/
inline
void
PushData
(
int
tid
,
data_size_t
line_idx
,
double
value
)
{
inline
void
PushData
(
int
tid
,
int
sub_feature_idx
,
data_size_t
line_idx
,
double
value
)
{
unsigned
int
bin
=
bin_mapper_
->
ValueToBin
(
value
);
uint32_t
bin
=
bin_mappers_
[
sub_feature_idx
]
->
ValueToBin
(
value
);
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
if
(
bin
==
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
())
{
return
;
}
}
bin
+=
bin_offsets_
[
sub_feature_idx
];
if
(
bin_mappers_
[
sub_feature_idx
]
->
GetDefaultBin
()
==
0
)
{
inline
void
PushBin
(
int
tid
,
data_size_t
line_idx
,
unsigned
int
bin
)
{
bin
-=
1
;
}
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
}
}
inline
void
CopySubset
(
const
Feature
*
full_feature
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
inline
void
CopySubset
(
const
Feature
Group
*
full_feature
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
bin_data_
->
CopySubset
(
full_feature
->
bin_data_
.
get
(),
used_indices
,
num_used_indices
);
bin_data_
->
CopySubset
(
full_feature
->
bin_data_
.
get
(),
used_indices
,
num_used_indices
);
}
}
inline
void
ReSize
(
data_size_t
num_data
)
{
inline
BinIterator
*
SubFetureIterator
(
int
sub_feature
)
{
bin_data_
->
ReSize
(
num_data
);
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
GetIterator
(
min_bin
,
max_bin
,
default_bin
);
}
}
inline
bool
is_sparse
()
const
{
return
is_sparse_
;
}
inline
data_size_t
Split
(
int
sub_feature
,
inline
void
FinishLoad
()
{
bin_data_
->
FinishLoad
();
}
uint32_t
threshold
,
/*! \brief Index of this feature */
data_size_t
*
data_indices
,
data_size_t
num_data
,
inline
int
feature_index
()
const
{
return
feature_index_
;
}
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
{
/*! \brief Bin mapper that this feature used */
inline
const
BinMapper
*
bin_mapper
()
const
{
return
bin_mapper_
.
get
();
}
uint32_t
min_bin
=
bin_offsets_
[
sub_feature
];
/*! \brief Number of bin of this feature */
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
inline
int
num_bin
()
const
{
return
bin_mapper_
->
num_bin
();
}
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
inline
BinType
bin_type
()
const
{
return
bin_mapper_
->
bin_type
();
}
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
);
/*! \brief Get bin data of this feature */
}
inline
const
Bin
*
bin_data
()
const
{
return
bin_data_
.
get
();
}
/*!
/*!
* \brief From bin to feature value
* \brief From bin to feature value
* \param bin
* \param bin
* \return Feature value of this bin
* \return Feature
Group
value of this bin
*/
*/
inline
double
BinToValue
(
unsigned
int
bin
)
inline
double
BinToValue
(
int
sub_feature_idx
,
uint32_t
bin
)
const
{
const
{
return
bin_mapper_
->
BinToValue
(
bin
);
}
return
bin_mappers_
[
sub_feature_idx
]
->
BinToValue
(
bin
);
}
/*!
/*!
* \brief Save binary data to file
* \brief Save binary data to file
* \param file File want to write
* \param file File want to write
*/
*/
void
SaveBinaryToFile
(
FILE
*
file
)
const
{
void
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
feature_index_
,
sizeof
(
feature_index_
),
1
,
file
);
fwrite
(
&
is_sparse_
,
sizeof
(
is_sparse_
),
1
,
file
);
fwrite
(
&
is_sparse_
,
sizeof
(
is_sparse_
),
1
,
file
);
bin_mapper_
->
SaveBinaryToFile
(
file
);
fwrite
(
&
num_feature_
,
sizeof
(
num_feature_
),
1
,
file
);
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
bin_mappers_
[
i
]
->
SaveBinaryToFile
(
file
);
}
bin_data_
->
SaveBinaryToFile
(
file
);
bin_data_
->
SaveBinaryToFile
(
file
);
}
}
/*!
/*!
* \brief Get sizes in byte of this object
* \brief Get sizes in byte of this object
*/
*/
size_t
SizesInByte
()
const
{
size_t
SizesInByte
()
const
{
return
sizeof
(
feature_index_
)
+
sizeof
(
is_sparse_
)
+
size_t
ret
=
sizeof
(
is_sparse_
)
+
sizeof
(
num_feature_
);
bin_mapper_
->
SizesInByte
()
+
bin_data_
->
SizesInByte
();
for
(
int
i
=
0
;
i
<
num_feature_
;
++
i
)
{
ret
+=
bin_mappers_
[
i
]
->
SizesInByte
();
}
ret
+=
bin_data_
->
SizesInByte
();
return
ret
;
}
}
/*! \brief Disable copy */
/*! \brief Disable copy */
Feature
&
operator
=
(
const
Feature
&
)
=
delete
;
Feature
Group
&
operator
=
(
const
Feature
Group
&
)
=
delete
;
/*! \brief Disable copy */
/*! \brief Disable copy */
Feature
(
const
Feature
&
)
=
delete
;
Feature
Group
(
const
Feature
Group
&
)
=
delete
;
private:
private:
/*! \brief Index of this feature */
/*! \brief Number of features */
int
feature_index_
;
int
num_feature_
;
/*! \brief Bin mapper that this feature used */
/*! \brief Bin mapper for sub features */
std
::
unique_ptr
<
BinMapper
>
bin_mapper_
;
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>
bin_mappers_
;
/*! \brief Bin offsets for sub features */
std
::
vector
<
uint32_t
>
bin_offsets_
;
/*! \brief Bin data of this feature */
/*! \brief Bin data of this feature */
std
::
unique_ptr
<
Bin
>
bin_data_
;
std
::
unique_ptr
<
Bin
>
bin_data_
;
/*! \brief True if this feature is sparse */
/*! \brief True if this feature is sparse */
bool
is_sparse_
;
bool
is_sparse_
;
int
num_total_bin_
;
};
};
}
// namespace LightGBM
}
// namespace LightGBM
#endif // L
ight
GBM_FEATURE_H_
#endif // L
IGHT
GBM_FEATURE_
GROUP_
H_
include/LightGBM/tree.h
View file @
4f77bd28
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_
#define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <string>
#include <string>
...
@@ -35,7 +34,6 @@ public:
...
@@ -35,7 +34,6 @@ public:
* \brief Performing a split on tree leaves.
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
* \param threshold_double Threshold on feature value
...
@@ -46,7 +44,7 @@ public:
...
@@ -46,7 +44,7 @@ public:
* \param gain Split gain
* \param gain Split gain
* \return The index of new leaf.
* \return The index of new leaf.
*/
*/
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
unsigned
in
t
threshold
,
int
real_feature
,
int
Split
(
int
leaf
,
int
feature
,
uint32_
t
threshold
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
...
@@ -64,8 +62,9 @@ public:
...
@@ -64,8 +62,9 @@ public:
* \param num_data Number of total data
* \param num_data Number of total data
* \param score Will add prediction to score
* \param score Will add prediction to score
*/
*/
void
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
void
AddPredictionToScore
(
const
Dataset
*
data
,
double
*
score
)
const
;
data_size_t
num_data
,
double
*
score
)
const
;
/*!
/*!
* \brief Adding prediction value of this tree model to scorese
* \brief Adding prediction value of this tree model to scorese
...
@@ -93,7 +92,7 @@ public:
...
@@ -93,7 +92,7 @@ public:
inline
int
leaf_depth
(
int
leaf_idx
)
const
{
return
leaf_depth_
[
leaf_idx
];
}
inline
int
leaf_depth
(
int
leaf_idx
)
const
{
return
leaf_depth_
[
leaf_idx
];
}
/*! \brief Get feature of specific split*/
/*! \brief Get feature of specific split*/
inline
int
split_feature
_real
(
int
split_idx
)
const
{
return
split_feature_
real_
[
split_idx
];
}
inline
int
split_feature
(
int
split_idx
)
const
{
return
split_feature_
[
split_idx
];
}
/*!
/*!
* \brief Shrinkage for the tree's output
* \brief Shrinkage for the tree's output
...
@@ -101,8 +100,9 @@ public:
...
@@ -101,8 +100,9 @@ public:
* \param rate The factor of shrinkage
* \param rate The factor of shrinkage
*/
*/
inline
void
Shrinkage
(
double
rate
)
{
inline
void
Shrinkage
(
double
rate
)
{
#pragma omp parallel for schedule(static)
for
(
int
i
=
0
;
i
<
num_leaves_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_leaves_
;
++
i
)
{
leaf_value_
[
i
]
=
leaf_value_
[
i
]
*
rate
;
leaf_value_
[
i
]
*
=
rate
;
}
}
}
}
...
@@ -112,15 +112,6 @@ public:
...
@@ -112,15 +112,6 @@ public:
/*! \brief Serialize this object to json*/
/*! \brief Serialize this object to json*/
std
::
string
ToJSON
();
std
::
string
ToJSON
();
template
<
typename
T
>
static
bool
CategoricalDecision
(
T
fval
,
T
threshold
)
{
if
(
static_cast
<
int
>
(
fval
)
==
static_cast
<
int
>
(
threshold
))
{
return
true
;
}
else
{
return
false
;
}
}
template
<
typename
T
>
template
<
typename
T
>
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
if
(
fval
<=
threshold
)
{
if
(
fval
<=
threshold
)
{
...
@@ -130,26 +121,13 @@ public:
...
@@ -130,26 +121,13 @@ public:
}
}
}
}
static
const
char
*
GetDecisionTypeName
(
int8_t
type
)
{
private:
if
(
type
==
0
)
{
return
"no_greater"
;
}
else
{
return
"is"
;
}
}
static
std
::
vector
<
bool
(
*
)(
unsigned
int
,
unsigned
int
)
>
inner_decision_funs
;
inline
int
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
static
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
decision_funs
;
data_size_t
data_idx
)
const
;
private:
inline
int
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
/*!
data_size_t
data_idx
)
const
;
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline
int
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
/*!
/*!
* \brief Find leaf index of which record belongs by features
* \brief Find leaf index of which record belongs by features
...
@@ -171,15 +149,13 @@ private:
...
@@ -171,15 +149,13 @@ private:
/*! \brief A non-leaf node's right child */
/*! \brief A non-leaf node's right child */
std
::
vector
<
int
>
right_child_
;
std
::
vector
<
int
>
right_child_
;
/*! \brief A non-leaf node's split feature */
/*! \brief A non-leaf node's split feature */
std
::
vector
<
int
>
split_feature_
;
std
::
vector
<
int
>
split_feature_
inner
;
/*! \brief A non-leaf node's split feature, the original index */
/*! \brief A non-leaf node's split feature, the original index */
std
::
vector
<
int
>
split_feature_
real_
;
std
::
vector
<
int
>
split_feature_
;
/*! \brief A non-leaf node's split threshold in bin */
/*! \brief A non-leaf node's split threshold in bin */
std
::
vector
<
u
nsigned
in
t
>
threshold_in_bin_
;
std
::
vector
<
u
int32_
t
>
threshold_in_bin_
;
/*! \brief A non-leaf node's split threshold in feature value */
/*! \brief A non-leaf node's split threshold in feature value */
std
::
vector
<
double
>
threshold_
;
std
::
vector
<
double
>
threshold_
;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std
::
vector
<
int8_t
>
decision_type_
;
/*! \brief A non-leaf node's split gain */
/*! \brief A non-leaf node's split gain */
std
::
vector
<
double
>
split_gain_
;
std
::
vector
<
double
>
split_gain_
;
// used for leaf node
// used for leaf node
...
@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
...
@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return
leaf
;
return
leaf
;
}
}
inline
int
Tree
::
GetLeaf
(
const
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
inline
int
Tree
::
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
node
]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
int
node
=
0
;
while
(
node
>=
0
)
{
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]]
(
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
split_feature_
[
node
]]
->
Get
(
data_idx
),
iterators
[
split_feature_
inner
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
node
=
left_child_
[
node
];
}
else
{
}
else
{
node
=
right_child_
[
node
];
node
=
right_child_
[
node
];
...
@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat
...
@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
int
node
=
0
;
int
node
=
0
;
while
(
node
>=
0
)
{
while
(
node
>=
0
)
{
if
(
decision_funs
[
decision_type_
[
node
]]
(
if
(
NumericalDecision
<
double
>
(
feature_values
[
split_feature_
real_
[
node
]],
feature_values
[
split_feature_
[
node
]],
threshold_
[
node
]))
{
threshold_
[
node
]))
{
node
=
left_child_
[
node
];
node
=
left_child_
[
node
];
}
else
{
}
else
{
...
...
include/LightGBM/utils/array_args.h
View file @
4f77bd28
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#include <vector>
#include <vector>
#include <algorithm>
#include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace
LightGBM
{
namespace
LightGBM
{
...
@@ -12,88 +13,136 @@ namespace LightGBM {
...
@@ -12,88 +13,136 @@ namespace LightGBM {
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
ArrayArgs
{
class
ArrayArgs
{
public:
public:
inline
static
size_t
ArgMaxMT
(
const
std
::
vector
<
VAL_T
>&
array
)
{
int
num_threads
=
1
;
#pragma omp parallel
#pragma omp master
{
num_threads
=
omp_get_num_threads
();
}
int
step
=
std
::
max
(
1
,
(
static_cast
<
int
>
(
array
.
size
())
+
num_threads
-
1
)
/
num_threads
);
std
::
vector
<
size_t
>
arg_maxs
(
num_threads
,
0
);
#pragma omp parallel for schedule(static,1)
for
(
int
i
=
0
;
i
<
num_threads
;
++
i
)
{
size_t
start
=
step
*
i
;
if
(
start
>=
array
.
size
())
{
continue
;
}
size_t
end
=
std
::
min
(
array
.
size
(),
start
+
step
);
size_t
arg_max
=
start
;
for
(
size_t
j
=
start
+
1
;
j
<
end
;
++
j
)
{
if
(
array
[
j
]
>
array
[
arg_max
])
{
arg_max
=
j
;
}
}
arg_maxs
[
i
]
=
arg_max
;
}
size_t
ret
=
arg_maxs
[
0
];
for
(
int
i
=
1
;
i
<
num_threads
;
++
i
)
{
if
(
array
[
arg_maxs
[
i
]]
>
array
[
ret
])
{
ret
=
arg_maxs
[
i
];
}
}
return
ret
;
}
inline
static
size_t
ArgMax
(
const
std
::
vector
<
VAL_T
>&
array
)
{
inline
static
size_t
ArgMax
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
if
(
array
.
empty
())
{
return
0
;
return
0
;
}
}
size_t
argMax
=
0
;
if
(
array
.
size
()
>
100
)
{
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
return
ArgMaxMT
(
array
);
if
(
array
[
i
]
>
array
[
argMax
])
{
}
else
{
argMax
=
i
;
size_t
arg_max
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
>
array
[
arg_max
])
{
arg_max
=
i
;
}
}
}
return
arg_max
;
}
}
return
argMax
;
}
}
inline
static
size_t
ArgMin
(
const
std
::
vector
<
VAL_T
>&
array
)
{
inline
static
size_t
ArgMin
(
const
std
::
vector
<
VAL_T
>&
array
)
{
if
(
array
.
empty
())
{
if
(
array
.
empty
())
{
return
0
;
return
0
;
}
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
array
.
size
();
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
M
in
=
i
;
arg
_m
in
=
i
;
}
}
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
}
inline
static
size_t
ArgMax
(
const
VAL_T
*
array
,
size_t
n
)
{
inline
static
size_t
ArgMax
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
if
(
n
<=
0
)
{
return
0
;
return
0
;
}
}
size_t
arg
M
ax
=
0
;
size_t
arg
_m
ax
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
>
array
[
arg
M
ax
])
{
if
(
array
[
i
]
>
array
[
arg
_m
ax
])
{
arg
M
ax
=
i
;
arg
_m
ax
=
i
;
}
}
}
}
return
arg
M
ax
;
return
arg
_m
ax
;
}
}
inline
static
size_t
ArgMin
(
const
VAL_T
*
array
,
size_t
n
)
{
inline
static
size_t
ArgMin
(
const
VAL_T
*
array
,
size_t
n
)
{
if
(
n
<=
0
)
{
if
(
n
<=
0
)
{
return
0
;
return
0
;
}
}
size_t
arg
M
in
=
0
;
size_t
arg
_m
in
=
0
;
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
if
(
array
[
i
]
<
array
[
arg
M
in
])
{
if
(
array
[
i
]
<
array
[
arg
_m
in
])
{
arg
M
in
=
i
;
arg
_m
in
=
i
;
}
}
}
}
return
arg
M
in
;
return
arg
_m
in
;
}
}
inline
static
size_t
Partition
(
std
::
vector
<
VAL_T
>*
array
,
size_t
start
,
size_t
end
)
{
inline
static
void
Partition
(
std
::
vector
<
VAL_T
>*
arr
,
int
start
,
int
end
,
int
*
l
,
int
*
r
)
{
VAL_T
&
pivot
=
(
*
array
)[
end
-
1
];
int
i
=
start
-
1
;
size_t
p_idx
=
start
;
int
j
=
end
-
1
;
for
(
size_t
i
=
start
;
i
<
end
-
1
;
++
i
)
{
int
p
=
i
;
if
((
*
array
)[
i
]
>
pivot
)
{
int
q
=
j
;
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
i
]);
if
(
start
>=
end
)
{
++
p_idx
;
return
;
}
}
}
std
::
swap
((
*
array
)[
p_idx
],
(
*
array
)[
end
-
1
]);
std
::
vector
<
VAL_T
>&
ref
=
*
arr
;
return
p_idx
;
VAL_T
v
=
ref
[
end
-
1
];
for
(;;)
{
while
(
ref
[
++
i
]
>
v
);
while
(
v
>
ref
[
--
j
])
{
if
(
j
==
start
)
{
break
;
}
}
if
(
i
>=
j
)
{
break
;
}
std
::
swap
(
ref
[
i
],
ref
[
j
]);
if
(
ref
[
i
]
==
v
)
{
p
++
;
std
::
swap
(
ref
[
p
],
ref
[
i
]);
}
if
(
v
==
ref
[
j
])
{
q
--
;
std
::
swap
(
ref
[
j
],
ref
[
q
]);
}
}
std
::
swap
(
ref
[
i
],
ref
[
end
-
1
]);
j
=
i
-
1
;
i
=
i
+
1
;
for
(
int
k
=
start
;
k
<=
p
;
k
++
,
j
--
)
{
std
::
swap
(
ref
[
k
],
ref
[
j
]);
}
for
(
int
k
=
end
-
2
;
k
>=
q
;
k
--
,
i
++
)
{
std
::
swap
(
ref
[
i
],
ref
[
k
]);
}
*
l
=
j
;
*
r
=
i
;
};
};
inline
static
size_
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
ay
,
size_
t
start
,
size_
t
end
,
size_
t
k
)
{
inline
static
in
t
ArgMaxAtK
(
std
::
vector
<
VAL_T
>*
arr
,
in
t
start
,
in
t
end
,
in
t
k
)
{
if
(
start
=
=
end
-
1
)
{
if
(
start
>
=
end
-
1
)
{
return
start
;
return
start
;
}
}
size_t
p_idx
=
Partition
(
array
,
start
,
end
)
;
int
l
=
start
;
i
f
(
p_idx
==
k
)
{
i
nt
r
=
end
-
1
;
return
p_idx
;
Partition
(
arr
,
start
,
end
,
&
l
,
&
r
)
;
}
if
((
k
>
l
&&
k
<
r
)
||
l
==
0
||
r
==
end
-
1
)
{
else
if
(
k
<
p_idx
)
{
return
k
;
return
ArgMaxAtK
(
array
,
start
,
p_idx
,
k
);
}
else
if
(
k
<=
l
)
{
}
return
ArgMaxAtK
(
arr
,
start
,
l
,
k
);
else
{
}
else
{
return
ArgMaxAtK
(
arr
ay
,
p_idx
+
1
,
end
,
k
);
return
ArgMaxAtK
(
arr
,
r
,
end
,
k
);
}
}
}
}
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
size_
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
inline
static
void
MaxK
(
const
std
::
vector
<
VAL_T
>&
array
,
in
t
k
,
std
::
vector
<
VAL_T
>*
out
)
{
out
->
clear
();
out
->
clear
();
if
(
k
<=
0
)
{
if
(
k
<=
0
)
{
return
;
return
;
...
@@ -104,7 +153,7 @@ public:
...
@@ -104,7 +153,7 @@ public:
if
(
k
>=
array
.
size
())
{
if
(
k
>=
array
.
size
())
{
return
;
return
;
}
}
ArgMaxAtK
(
out
,
0
,
out
->
size
(),
k
-
1
);
ArgMaxAtK
(
out
,
0
,
static_cast
<
int
>
(
out
->
size
()
)
,
k
-
1
);
out
->
erase
(
out
->
begin
()
+
k
,
out
->
end
());
out
->
erase
(
out
->
begin
()
+
k
,
out
->
end
());
}
}
...
...
include/LightGBM/utils/common.h
View file @
4f77bd28
...
@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
...
@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac
=
0
;
frac
=
0
;
scale
=
1.0
;
scale
=
1.0
;
if
((
*
p
==
'e'
)
||
(
*
p
==
'E'
))
{
if
((
*
p
==
'e'
)
||
(
*
p
==
'E'
))
{
u
nsigned
in
t
expon
;
u
int32_
t
expon
;
// Get sign of exponent, if any.
// Get sign of exponent, if any.
++
p
;
++
p
;
if
(
*
p
==
'-'
)
{
if
(
*
p
==
'-'
)
{
...
...
include/LightGBM/utils/random.h
View file @
4f77bd28
...
@@ -20,30 +20,41 @@ public:
...
@@ -20,30 +20,41 @@ public:
std
::
random_device
rd
;
std
::
random_device
rd
;
auto
genrator
=
std
::
mt19937
(
rd
());
auto
genrator
=
std
::
mt19937
(
rd
());
std
::
uniform_int_distribution
<
int
>
distribution
(
0
,
x
);
std
::
uniform_int_distribution
<
int
>
distribution
(
0
,
x
);
x
=
static_cast
<
unsigned
int
>
(
distribution
(
genrator
)
)
;
x
=
distribution
(
genrator
);
}
}
/*!
/*!
* \brief Constructor, with specific seed
* \brief Constructor, with specific seed
*/
*/
Random
(
int
seed
)
{
Random
(
int
seed
)
{
x
=
static_cast
<
unsigned
int
>
(
seed
)
;
x
=
seed
;
}
}
/*!
/*!
* \brief Generate random integer
* \brief Generate random integer, int16 range. [0, 65536]
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline
int
NextShort
(
int
lower_bound
,
int
upper_bound
)
{
return
(
RandInt16
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
/*!
* \brief Generate random integer, int32 range
* \param lower_bound lower bound
* \param lower_bound lower bound
* \param upper_bound upper bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
* \return The random integer between [lower_bound, upper_bound)
*/
*/
inline
int
NextInt
(
int
lower_bound
,
int
upper_bound
)
{
inline
int
NextInt
(
int
lower_bound
,
int
upper_bound
)
{
return
(
fastrand
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
return
(
RandInt32
())
%
(
upper_bound
-
lower_bound
)
+
lower_bound
;
}
}
/*!
/*!
* \brief Generate random float data
* \brief Generate random float data
* \return The random float between [0.0, 1.0)
* \return The random float between [0.0, 1.0)
*/
*/
inline
float
NextFloat
()
{
inline
float
NextFloat
()
{
// get random float in [0,1)
// get random float in [0,1)
return
static_cast
<
float
>
(
fastrand
())
/
(
32768.0
f
);
return
static_cast
<
float
>
(
RandInt16
())
/
(
32768.0
f
);
}
}
/*!
/*!
* \brief Sample K data from {0,1,...,N-1}
* \brief Sample K data from {0,1,...,N-1}
...
@@ -65,10 +76,16 @@ public:
...
@@ -65,10 +76,16 @@ public:
return
ret
;
return
ret
;
}
}
private:
private:
inline
int
fastrand
()
{
inline
int
RandInt16
()
{
x
=
(
214013
*
x
+
2531011
);
x
=
(
214013
*
x
+
2531011
);
return
(
x
>>
16
)
&
0x7FFF
;
return
(
x
>>
16
)
&
0x7FFF
;
}
}
inline
int
RandInt32
()
{
x
=
(
214013
*
x
+
2531011
);
return
x
&
0x7FFFFFF
;
}
int
x
=
123456789
;
int
x
=
123456789
;
};
};
...
...
pmml/pmml.py
View file @
4f77bd28
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf):
    """Emit one PMML <SimplePredicate/> element for a tree split.

    A left child encodes the 'lessOrEqual' comparison against the split
    threshold; a right child encodes 'greaterThan'.
    """
    op = 'lessOrEqual' if is_left_child else 'greaterThan'
    field = get_field_name(node_id, prev_node_idx, is_leaf)
    value = get_threshold(node_id, prev_node_idx, is_leaf)
    indent = '\t' * (tab_len + 1)
    out_(indent + '<SimplePredicate field="{0}" operator="{1}" value="{2}" />'.format(field, op, value))
...
@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
...
@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature
=
get_array_ints
(
next
(
model_content
))
split_feature
=
get_array_ints
(
next
(
model_content
))
split_gain
=
next
(
model_content
)
# unused
split_gain
=
next
(
model_content
)
# unused
threshold
=
get_array_strings
(
next
(
model_content
))
threshold
=
get_array_strings
(
next
(
model_content
))
decision_type
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
...
...
python-package/lightgbm/basic.py
View file @
4f77bd28
...
@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
...
@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
pandas_categorical
):
def
_data_from_pandas
(
data
,
feature_name
):
if
isinstance
(
data
,
DataFrame
):
if
isinstance
(
data
,
DataFrame
):
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
...
@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
...
@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
warnings
.
filterwarnings
(
'once'
)
warnings
.
filterwarnings
(
'once'
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
data
=
data
.
rename
(
columns
=
str
)
data
=
data
.
rename
(
columns
=
str
)
cat_cols
=
data
.
select_dtypes
(
include
=
[
'category'
]).
columns
if
pandas_categorical
is
None
:
# train dataset
pandas_categorical
=
[
list
(
data
[
col
].
cat
.
categories
)
for
col
in
cat_cols
]
else
:
if
len
(
cat_cols
)
!=
len
(
pandas_categorical
):
raise
ValueError
(
'train and valid dataset categorical_feature do not match.'
)
for
col
,
category
in
zip
(
cat_cols
,
pandas_categorical
):
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is pandas Index object
data
=
data
.
copy
()
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
)
if
categorical_feature
is
not
None
:
if
feature_name
is
None
:
feature_name
=
list
(
data
.
columns
)
if
categorical_feature
==
'auto'
:
categorical_feature
=
list
(
cat_cols
)
else
:
categorical_feature
=
list
(
categorical_feature
)
+
list
(
cat_cols
)
if
feature_name
==
'auto'
:
if
feature_name
==
'auto'
:
feature_name
=
list
(
data
.
columns
)
feature_name
=
list
(
data
.
columns
)
data_dtypes
=
data
.
dtypes
data_dtypes
=
data
.
dtypes
...
@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
...
@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
else
:
else
:
if
feature_name
==
'auto'
:
if
feature_name
==
'auto'
:
feature_name
=
None
feature_name
=
None
if
categorical_feature
==
'auto'
:
return
data
,
feature_name
categorical_feature
=
None
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
def
_label_from_pandas
(
label
):
def
_label_from_pandas
(
label
):
...
@@ -277,19 +256,6 @@ def _label_from_pandas(label):
...
@@ -277,19 +256,6 @@ def _label_from_pandas(label):
return
label
return
label
def _save_pandas_categorical(file_name, pandas_categorical):
    """Append the pandas categorical levels to the end of a saved model file.

    The levels are stored as a JSON payload on a marker line so they can be
    recovered later by _load_pandas_categorical.
    """
    serialized = json.dumps(pandas_categorical, default=json_default_with_numpy)
    with open(file_name, 'a') as model_file:
        model_file.write('\npandas_categorical:' + serialized)
def
_load_pandas_categorical
(
file_name
):
with
open
(
file_name
,
'r'
)
as
f
:
last_line
=
f
.
readlines
()[
-
1
]
if
last_line
.
startswith
(
'pandas_categorical:'
):
return
json
.
loads
(
last_line
[
len
(
'pandas_categorical:'
):])
return
None
class
_InnerPredictor
(
object
):
class
_InnerPredictor
(
object
):
"""
"""
A _InnerPredictor of LightGBM.
A _InnerPredictor of LightGBM.
...
@@ -321,7 +287,6 @@ class _InnerPredictor(object):
...
@@ -321,7 +287,6 @@ class _InnerPredictor(object):
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
booster_handle
is
not
None
:
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
False
self
.
__is_manage_handle
=
False
self
.
handle
=
booster_handle
self
.
handle
=
booster_handle
...
@@ -335,7 +300,6 @@ class _InnerPredictor(object):
...
@@ -335,7 +300,6 @@ class _InnerPredictor(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
None
else
:
else
:
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
...
@@ -371,7 +335,7 @@ class _InnerPredictor(object):
...
@@ -371,7 +335,7 @@ class _InnerPredictor(object):
"""
"""
if
isinstance
(
data
,
Dataset
):
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
data
=
_data_from_pandas
(
data
,
None
)[
0
]
predict_type
=
C_API_PREDICT_NORMAL
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
predict_type
=
C_API_PREDICT_RAW_SCORE
...
@@ -532,7 +496,7 @@ class Dataset(object):
...
@@ -532,7 +496,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
"""Dataset in LightGBM."""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
,
feature_name
=
'auto'
,
params
=
None
,
free_raw_data
=
True
):
free_raw_data
=
True
):
"""
"""
Parameters
Parameters
...
@@ -555,11 +519,6 @@ class Dataset(object):
...
@@ -555,11 +519,6 @@ class Dataset(object):
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
params: dict, optional
Other parameters
Other parameters
free_raw_data: Bool
free_raw_data: Bool
...
@@ -574,12 +533,10 @@ class Dataset(object):
...
@@ -574,12 +533,10 @@ class Dataset(object):
self
.
group
=
group
self
.
group
=
group
self
.
silent
=
silent
self
.
silent
=
silent
self
.
feature_name
=
feature_name
self
.
feature_name
=
feature_name
self
.
categorical_feature
=
categorical_feature
self
.
params
=
params
self
.
params
=
params
self
.
free_raw_data
=
free_raw_data
self
.
free_raw_data
=
free_raw_data
self
.
used_indices
=
None
self
.
used_indices
=
None
self
.
_predictor
=
None
self
.
_predictor
=
None
self
.
pandas_categorical
=
None
def
__del__
(
self
):
def
__del__
(
self
):
self
.
_free_handle
()
self
.
_free_handle
()
...
@@ -592,11 +549,11 @@ class Dataset(object):
...
@@ -592,11 +549,11 @@ class Dataset(object):
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
silent
=
False
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
):
params
=
None
):
if
data
is
None
:
if
data
is
None
:
self
.
handle
=
None
self
.
handle
=
None
return
return
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
)
data
,
feature_name
,
=
_data_from_pandas
(
data
,
feature_name
)
label
=
_label_from_pandas
(
label
)
label
=
_label_from_pandas
(
label
)
self
.
data_has_header
=
False
self
.
data_has_header
=
False
"""process for args"""
"""process for args"""
...
@@ -608,23 +565,6 @@ class Dataset(object):
...
@@ -608,23 +565,6 @@ class Dataset(object):
params
[
"verbose"
]
=
0
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
params
[
"verbose"
]
=
1
"""get categorical features"""
if
categorical_feature
is
not
None
:
categorical_indices
=
set
()
feature_dict
=
{}
if
feature_name
is
not
None
:
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
for
name
in
categorical_feature
:
if
isinstance
(
name
,
string_type
)
and
name
in
feature_dict
:
categorical_indices
.
add
(
feature_dict
[
name
])
elif
isinstance
(
name
,
integer_types
):
categorical_indices
.
add
(
name
)
else
:
raise
TypeError
(
"Wrong type({}) or unknown name({}) in categorical_feature"
.
format
(
type
(
name
).
__name__
,
name
))
params
[
'categorical_column'
]
=
sorted
(
categorical_indices
)
params_str
=
param_dict_to_str
(
params
)
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
"""process for reference dataset"""
ref_dataset
=
None
ref_dataset
=
None
...
@@ -784,7 +724,7 @@ class Dataset(object):
...
@@ -784,7 +724,7 @@ class Dataset(object):
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
categorical_feature
=
self
.
categorical_feature
,
params
=
self
.
params
)
params
=
self
.
params
)
if
self
.
free_raw_data
:
if
self
.
free_raw_data
:
self
.
data
=
None
self
.
data
=
None
return
self
return
self
...
@@ -814,7 +754,6 @@ class Dataset(object):
...
@@ -814,7 +754,6 @@ class Dataset(object):
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
free_raw_data
=
self
.
free_raw_data
)
ret
.
_predictor
=
self
.
_predictor
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
return
ret
return
ret
def
subset
(
self
,
used_indices
,
params
=
None
):
def
subset
(
self
,
used_indices
,
params
=
None
):
...
@@ -829,9 +768,8 @@ class Dataset(object):
...
@@ -829,9 +768,8 @@ class Dataset(object):
Other parameters
Other parameters
"""
"""
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
params
=
params
)
ret
.
_predictor
=
self
.
_predictor
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
ret
.
used_indices
=
used_indices
ret
.
used_indices
=
used_indices
return
ret
return
ret
...
@@ -939,24 +877,6 @@ class Dataset(object):
...
@@ -939,24 +877,6 @@ class Dataset(object):
else
:
else
:
raise
TypeError
(
"Unknown type"
)
raise
TypeError
(
"Unknown type"
)
def set_categorical_feature(self, categorical_feature):
    """
    Set categorical features

    Parameters
    ----------
    categorical_feature : list of int or str
        Name/index of categorical features
    """
    # No-op when nothing changes, so the constructed handle is kept.
    if self.categorical_feature == categorical_feature:
        return
    # Changing categorical features requires re-binning from raw data.
    if self.data is None:
        raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
    self.categorical_feature = categorical_feature
    self._free_handle()
def
_set_predictor
(
self
,
predictor
):
def
_set_predictor
(
self
,
predictor
):
"""
"""
Set predictor for continued training, not recommand for user to call this function.
Set predictor for continued training, not recommand for user to call this function.
...
@@ -979,7 +899,6 @@ class Dataset(object):
...
@@ -979,7 +899,6 @@ class Dataset(object):
reference : Dataset
reference : Dataset
Will use reference as template to consturct current dataset
Will use reference as template to consturct current dataset
"""
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
_set_predictor
(
reference
.
_predictor
)
self
.
_set_predictor
(
reference
.
_predictor
)
if
self
.
reference
is
reference
:
if
self
.
reference
is
reference
:
...
@@ -1208,7 +1127,6 @@ class Booster(object):
...
@@ -1208,7 +1127,6 @@ class Booster(object):
self
.
__inner_predict_buffer
=
[
None
]
self
.
__inner_predict_buffer
=
[
None
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__get_eval_info
()
self
.
__get_eval_info
()
self
.
pandas_categorical
=
train_set
.
pandas_categorical
elif
model_file
is
not
None
:
elif
model_file
is
not
None
:
"""Prediction task"""
"""Prediction task"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
out_num_iterations
=
ctypes
.
c_int
(
0
)
...
@@ -1221,7 +1139,6 @@ class Booster(object):
...
@@ -1221,7 +1139,6 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
self
.
__num_class
=
out_num_class
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
'model_str'
in
params
:
elif
'model_str'
in
params
:
self
.
__load_model_from_string
(
params
[
'model_str'
])
self
.
__load_model_from_string
(
params
[
'model_str'
])
else
:
else
:
...
@@ -1237,7 +1154,6 @@ class Booster(object):
...
@@ -1237,7 +1154,6 @@ class Booster(object):
def
__deepcopy__
(
self
,
_
):
def
__deepcopy__
(
self
,
_
):
model_str
=
self
.
__save_model_to_string
()
model_str
=
self
.
__save_model_to_string
()
booster
=
Booster
({
'model_str'
:
model_str
})
booster
=
Booster
({
'model_str'
:
model_str
})
booster
.
pandas_categorical
=
self
.
pandas_categorical
return
booster
return
booster
def
__getstate__
(
self
):
def
__getstate__
(
self
):
...
@@ -1477,7 +1393,6 @@ class Booster(object):
...
@@ -1477,7 +1393,6 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
ctypes
.
c_int
(
num_iteration
),
c_str
(
filename
)))
c_str
(
filename
)))
_save_pandas_categorical
(
filename
,
self
.
pandas_categorical
)
def
__load_model_from_string
(
self
,
model_str
):
def
__load_model_from_string
(
self
,
model_str
):
"""[Private] Load model from string"""
"""[Private] Load model from string"""
...
@@ -1589,7 +1504,6 @@ class Booster(object):
...
@@ -1589,7 +1504,6 @@ class Booster(object):
def
_to_predictor
(
self
):
def
_to_predictor
(
self
):
"""Convert to predictor"""
"""Convert to predictor"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
.
pandas_categorical
=
self
.
pandas_categorical
return
predictor
return
predictor
def
feature_name
(
self
):
def
feature_name
(
self
):
...
...
python-package/lightgbm/engine.py
View file @
4f77bd28
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""
"""
...
@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100,
...
@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
Requires at least one validation data and one metric
...
@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100,
...
@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100,
train_set
.
_update_params
(
params
)
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
is_valid_contain_train
=
False
is_valid_contain_train
=
False
train_data_name
=
"training"
train_data_name
=
"training"
...
@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results):
...
@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results):
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
callbacks
=
None
):
callbacks
=
None
):
...
@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10,
...
@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
every <early_stopping_rounds> round(s) to continue.
...
@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10,
...
@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10,
train_set
.
_update_params
(
params
)
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
if
metrics
:
if
metrics
:
params
.
setdefault
(
'metric'
,
[])
params
.
setdefault
(
'metric'
,
[])
...
...
python-package/lightgbm/plotting.py
View file @
4f77bd28
...
@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
...
@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
graph
.
node
(
name
,
label
=
label
)
graph
.
node
(
name
,
label
=
label
)
if
root
[
'decision_type'
]
==
'no_greater'
:
l_dec
,
r_dec
=
'<='
,
'>'
l_dec
,
r_dec
=
'<='
,
'>'
elif
root
[
'decision_type'
]
==
'is'
:
l_dec
,
r_dec
=
'is'
,
"isn't"
else
:
raise
ValueError
(
'Invalid decision type in tree model.'
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
else
:
# leaf
else
:
# leaf
...
...
python-package/lightgbm/sklearn.py
View file @
4f77bd28
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score
=
None
,
eval_group
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
"""
"""
Fit the gradient boosting model
Fit the gradient boosting model
...
@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase):
...
@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
See Callbacks in Python-API.md for more information.
...
@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase):
...
@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
if
evals_result
:
if
evals_result
:
...
@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
...
@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score
=
None
,
eval_init_score
=
None
,
eval_metric
=
"l2"
,
eval_metric
=
"l2"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
feature_name
=
'auto'
,
callbacks
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
init_score
=
init_score
,
eval_set
=
eval_set
,
init_score
=
init_score
,
eval_set
=
eval_set
,
...
@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
...
@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
...
@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
...
@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score
=
None
,
eval_init_score
=
None
,
eval_metric
=
"binary_logloss"
,
eval_metric
=
"binary_logloss"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
y
=
self
.
_le
.
transform
(
y
)
y
=
self
.
_le
.
transform
(
y
)
...
@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
...
@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
...
@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel):
...
@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel):
eval_init_score
=
None
,
eval_group
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
feature_name
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
"""
"""
Most arguments like common methods except following:
Most arguments like common methods except following:
...
@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel):
...
@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
src/boosting/boosting.cpp
View file @
4f77bd28
#include <LightGBM/boosting.h>
#include <LightGBM/boosting.h>
#include "gbdt.h"
#include "gbdt.h"
#include "dart.hpp"
#include "dart.hpp"
#include "goss.hpp"
namespace
LightGBM
{
namespace
LightGBM
{
...
@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
...
@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return
new
GBDT
();
return
new
GBDT
();
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
return
new
DART
();
return
new
DART
();
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
return
new
GOSS
();
}
else
{
}
else
{
return
nullptr
;
return
nullptr
;
}
}
...
@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
...
@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret
.
reset
(
new
GBDT
());
ret
.
reset
(
new
GBDT
());
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
}
else
if
(
type
==
std
::
string
(
"dart"
))
{
ret
.
reset
(
new
DART
());
ret
.
reset
(
new
DART
());
}
else
if
(
type
==
std
::
string
(
"goss"
))
{
ret
.
reset
(
new
GOSS
());
}
else
{
Log
::
Fatal
(
"unknow boosting type %s"
,
type
.
c_str
());
}
}
LoadFileToBoosting
(
ret
.
get
(),
filename
);
LoadFileToBoosting
(
ret
.
get
(),
filename
);
}
else
{
}
else
{
...
...
src/boosting/dart.hpp
View file @
4f77bd28
...
@@ -38,6 +38,11 @@ public:
...
@@ -38,6 +38,11 @@ public:
random_for_drop_
=
Random
(
gbdt_config_
->
drop_seed
);
random_for_drop_
=
Random
(
gbdt_config_
->
drop_seed
);
sum_weight_
=
0.0
f
;
sum_weight_
=
0.0
f
;
}
}
/*!
* \brief Reset the training data for DART boosting.
*        Simply forwards to GBDT::ResetTrainingData — the visible code adds
*        no DART-specific per-dataset state handling here.
* \param config Boosting configuration
* \param train_data New training dataset
* \param object_function Objective used to compute gradients
* \param training_metrics Metrics evaluated on the training data
*/
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data,
                       const ObjectiveFunction* object_function,
                       const std::vector<const Metric*>& training_metrics) override {
  GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
}
/*!
/*!
* \brief one training iteration
* \brief one training iteration
*/
*/
...
...
src/boosting/gbdt.cpp
View file @
4f77bd28
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/metric.h>
...
@@ -37,7 +36,6 @@ GBDT::GBDT()
...
@@ -37,7 +36,6 @@ GBDT::GBDT()
}
}
GBDT
::~
GBDT
()
{
GBDT
::~
GBDT
()
{
}
}
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
...
@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
...
@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
label_idx_
=
train_data
->
label_idx
();
label_idx_
=
train_data
->
label_idx
();
// get feature names
// get feature names
feature_names_
=
train_data
->
feature_names
();
feature_names_
=
train_data
->
feature_names
();
// get feature infos
feature_infos_
.
clear
();
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
++
i
)
{
int
feature_idx
=
train_data
->
GetInnerFeatureIndex
(
i
);
if
(
feature_idx
<
0
)
{
feature_infos_
.
push_back
(
"trival feature"
);
}
else
{
feature_infos_
.
push_back
(
train_data
->
FeatureAt
(
feature_idx
)
->
bin_mapper
()
->
bin_info
());
}
}
}
}
if
((
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
if
((
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
...
@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const {
...
@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const {
ss
<<
pairs
[
i
].
second
<<
"="
<<
std
::
to_string
(
pairs
[
i
].
first
)
<<
std
::
endl
;
ss
<<
pairs
[
i
].
second
<<
"="
<<
std
::
to_string
(
pairs
[
i
].
first
)
<<
std
::
endl
;
}
}
ss
<<
std
::
endl
<<
"feature information:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
++
i
)
{
ss
<<
feature_names_
[
i
]
<<
"="
<<
feature_infos_
[
i
]
<<
std
::
endl
;
}
return
ss
.
str
();
return
ss
.
str
();
}
}
...
@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
...
@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
Log
::
Fatal
(
"Wrong size of feature_names"
);
Log
::
Fatal
(
"Wrong size of feature_names"
);
return
false
;
return
false
;
}
}
}
else
{
}
else
{
Log
::
Fatal
(
"Model file doesn't contain feature names"
);
Log
::
Fatal
(
"Model file doesn't contain feature names"
);
return
false
;
return
false
;
}
}
// returns offset, or lines.size() if not found.
auto
find_string_lineno
=
[
&
lines
](
const
std
::
string
&
str
,
size_t
start_line
)
{
size_t
i
=
start_line
;
size_t
featinfo_find_pos
=
std
::
string
::
npos
;
while
(
i
<
lines
.
size
())
{
featinfo_find_pos
=
lines
[
i
].
find
(
str
);
if
(
featinfo_find_pos
!=
std
::
string
::
npos
)
break
;
++
i
;
}
return
i
;
};
// load feature information
{
size_t
finfo_line_idx
=
find_string_lineno
(
"feature information:"
,
0
);
if
(
finfo_line_idx
>=
lines
.
size
())
{
Log
::
Fatal
(
"Model file doesn't contain feature information"
);
return
false
;
}
feature_infos_
.
resize
(
max_feature_idx_
+
1
);
// search for each feature name
for
(
int
i
=
0
;
i
<
max_feature_idx_
+
1
;
i
++
)
{
const
auto
feat_name
=
feature_names_
[
i
];
size_t
line_idx
=
find_string_lineno
(
feat_name
+
"="
,
finfo_line_idx
+
1
);
if
(
line_idx
>=
lines
.
size
())
{
Log
::
Fatal
((
"Model file doesn't contain feature information for feature "
+
feat_name
).
c_str
());
return
false
;
}
const
auto
this_line
=
lines
[
line_idx
];
feature_infos_
[
i
]
=
this_line
.
substr
((
feat_name
+
"="
).
size
());
}
}
// get tree models
// get tree models
size_t
i
=
0
;
size_t
i
=
0
;
while
(
i
<
lines
.
size
())
{
while
(
i
<
lines
.
size
())
{
...
@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
...
@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std
::
vector
<
size_t
>
feature_importances
(
max_feature_idx_
+
1
,
0
);
std
::
vector
<
size_t
>
feature_importances
(
max_feature_idx_
+
1
,
0
);
for
(
size_t
iter
=
0
;
iter
<
models_
.
size
();
++
iter
)
{
for
(
size_t
iter
=
0
;
iter
<
models_
.
size
();
++
iter
)
{
for
(
int
split_idx
=
0
;
split_idx
<
models_
[
iter
]
->
num_leaves
()
-
1
;
++
split_idx
)
{
for
(
int
split_idx
=
0
;
split_idx
<
models_
[
iter
]
->
num_leaves
()
-
1
;
++
split_idx
)
{
++
feature_importances
[
models_
[
iter
]
->
split_feature
_real
(
split_idx
)];
++
feature_importances
[
models_
[
iter
]
->
split_feature
(
split_idx
)];
}
}
}
}
// store the importance first
// store the importance first
...
...
src/boosting/gbdt.h
View file @
4f77bd28
...
@@ -329,8 +329,6 @@ protected:
...
@@ -329,8 +329,6 @@ protected:
int
num_init_iteration_
;
int
num_init_iteration_
;
/*! \brief Feature names */
/*! \brief Feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Feature informations */
std
::
vector
<
std
::
string
>
feature_infos_
;
/*! \brief number of threads */
/*! \brief number of threads */
int
num_threads_
;
int
num_threads_
;
/*! \brief Buffer for multi-threading bagging */
/*! \brief Buffer for multi-threading bagging */
...
...
src/boosting/goss.hpp
0 → 100644
View file @
4f77bd28
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace
LightGBM
{
class
GOSS
:
public
GBDT
{
public:
/*!
* \brief Constructor
*/
GOSS
()
:
GBDT
()
{
}
~
GOSS
()
{
}
void
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
Init
(
config
,
train_data
,
object_function
,
training_metrics
);
CHECK
(
gbdt_config_
->
top_rate
+
gbdt_config_
->
other_rate
<=
1.0
f
);
CHECK
(
gbdt_config_
->
top_rate
>
0.0
f
&&
gbdt_config_
->
other_rate
>
0.0
f
);
if
(
gbdt_config_
->
bagging_freq
>
0
&&
gbdt_config_
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot used bagging in GOSS"
);
}
Log
::
Info
(
"using GOSS"
);
}
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
if
(
config
->
bagging_freq
>
0
&&
config
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot used bagging in GOSS"
);
}
GBDT
::
ResetTrainingData
(
config
,
train_data
,
object_function
,
training_metrics
);
if
(
train_data_
==
nullptr
)
{
return
;
}
bag_data_indices_
.
resize
(
num_data_
);
tmp_indices_
.
resize
(
num_data_
);
tmp_indice_right_
.
resize
(
num_data_
);
offsets_buf_
.
resize
(
num_threads_
);
left_cnts_buf_
.
resize
(
num_threads_
);
right_cnts_buf_
.
resize
(
num_threads_
);
left_write_pos_buf_
.
resize
(
num_threads_
);
right_write_pos_buf_
.
resize
(
num_threads_
);
is_use_subset_
=
false
;
if
(
config
->
top_rate
+
config
->
other_rate
<=
0.5
)
{
auto
bag_data_cnt
=
static_cast
<
data_size_t
>
((
config
->
top_rate
+
config
->
other_rate
)
*
num_data_
);
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt
));
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data_
);
is_use_subset_
=
true
;
}
// flag to not bagging first
bag_data_cnt_
=
num_data_
;
}
data_size_t
BaggingHelper
(
Random
&
cur_rand
,
data_size_t
start
,
data_size_t
cnt
,
data_size_t
*
buffer
,
data_size_t
*
buffer_right
)
{
std
::
vector
<
score_t
>
tmp_gradients
(
cnt
);
for
(
data_size_t
i
=
0
;
i
<
cnt
;
++
i
)
{
tmp_gradients
[
i
]
=
std
::
fabs
(
gradients_
[
start
+
i
]
*
hessians_
[
start
+
i
]);
}
data_size_t
top_k
=
static_cast
<
data_size_t
>
(
cnt
*
gbdt_config_
->
top_rate
);
data_size_t
other_k
=
static_cast
<
data_size_t
>
(
cnt
*
gbdt_config_
->
other_rate
);
top_k
=
std
::
max
(
1
,
top_k
);
ArrayArgs
<
score_t
>::
ArgMaxAtK
(
&
tmp_gradients
,
0
,
static_cast
<
int
>
(
tmp_gradients
.
size
()),
top_k
);
score_t
threshold
=
tmp_gradients
[
top_k
-
1
];
score_t
multiply
=
static_cast
<
score_t
>
(
cnt
-
top_k
)
/
other_k
;
data_size_t
cur_left_cnt
=
0
;
data_size_t
cur_right_cnt
=
0
;
data_size_t
big_weight_cnt
=
0
;
for
(
data_size_t
i
=
0
;
i
<
cnt
;
++
i
)
{
if
(
std
::
fabs
(
gradients_
[
start
+
i
]
*
hessians_
[
start
+
i
])
>=
threshold
)
{
buffer
[
cur_left_cnt
++
]
=
start
+
i
;
++
big_weight_cnt
;
}
else
{
data_size_t
sampled
=
cur_left_cnt
-
big_weight_cnt
;
data_size_t
rest_need
=
other_k
-
sampled
;
data_size_t
rest_all
=
(
cnt
-
i
)
-
(
top_k
-
big_weight_cnt
);
double
prob
=
(
rest_need
)
/
static_cast
<
double
>
(
rest_all
);
if
(
cur_rand
.
NextFloat
()
<
prob
)
{
buffer
[
cur_left_cnt
++
]
=
start
+
i
;
gradients_
[
start
+
i
]
*=
multiply
;
hessians_
[
start
+
i
]
*=
multiply
;
}
else
{
buffer_right
[
cur_right_cnt
++
]
=
start
+
i
;
}
}
}
return
cur_left_cnt
;
}
void
Bagging
(
int
iter
)
override
{
bag_data_cnt_
=
num_data_
;
// not subsample for first iterations
if
(
iter
<
static_cast
<
int
>
(
1.0
f
/
gbdt_config_
->
learning_rate
))
{
return
;
}
const
data_size_t
min_inner_size
=
1000
;
data_size_t
inner_size
=
(
num_data_
+
num_threads_
-
1
)
/
num_threads_
;
if
(
inner_size
<
min_inner_size
)
{
inner_size
=
min_inner_size
;
}
#pragma omp parallel for schedule(static, 1)
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
left_cnts_buf_
[
i
]
=
0
;
right_cnts_buf_
[
i
]
=
0
;
data_size_t
cur_start
=
i
*
inner_size
;
if
(
cur_start
>
num_data_
)
{
continue
;
}
data_size_t
cur_cnt
=
inner_size
;
if
(
cur_start
+
cur_cnt
>
num_data_
)
{
cur_cnt
=
num_data_
-
cur_start
;
}
Random
cur_rand
(
gbdt_config_
->
bagging_seed
+
iter
*
num_threads_
+
i
);
data_size_t
cur_left_count
=
BaggingHelper
(
cur_rand
,
cur_start
,
cur_cnt
,
tmp_indices_
.
data
()
+
cur_start
,
tmp_indice_right_
.
data
()
+
cur_start
);
offsets_buf_
[
i
]
=
cur_start
;
left_cnts_buf_
[
i
]
=
cur_left_count
;
right_cnts_buf_
[
i
]
=
cur_cnt
-
cur_left_count
;
}
data_size_t
left_cnt
=
0
;
left_write_pos_buf_
[
0
]
=
0
;
right_write_pos_buf_
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
num_threads_
;
++
i
)
{
left_write_pos_buf_
[
i
]
=
left_write_pos_buf_
[
i
-
1
]
+
left_cnts_buf_
[
i
-
1
];
right_write_pos_buf_
[
i
]
=
right_write_pos_buf_
[
i
-
1
]
+
right_cnts_buf_
[
i
-
1
];
}
left_cnt
=
left_write_pos_buf_
[
num_threads_
-
1
]
+
left_cnts_buf_
[
num_threads_
-
1
];
#pragma omp parallel for schedule(static, 1)
for
(
int
i
=
0
;
i
<
num_threads_
;
++
i
)
{
if
(
left_cnts_buf_
[
i
]
>
0
)
{
std
::
memcpy
(
bag_data_indices_
.
data
()
+
left_write_pos_buf_
[
i
],
tmp_indices_
.
data
()
+
offsets_buf_
[
i
],
left_cnts_buf_
[
i
]
*
sizeof
(
data_size_t
));
}
if
(
right_cnts_buf_
[
i
]
>
0
)
{
std
::
memcpy
(
bag_data_indices_
.
data
()
+
left_cnt
+
right_write_pos_buf_
[
i
],
tmp_indice_right_
.
data
()
+
offsets_buf_
[
i
],
right_cnts_buf_
[
i
]
*
sizeof
(
data_size_t
));
}
}
bag_data_cnt_
=
left_cnt
;
// set bagging data to tree learner
if
(
!
is_use_subset_
)
{
tree_learner_
->
SetBaggingData
(
bag_data_indices_
.
data
(),
bag_data_cnt_
);
}
else
{
// get subset
tmp_subset_
->
ReSize
(
bag_data_cnt_
);
tmp_subset_
->
CopySubset
(
train_data_
,
bag_data_indices_
.
data
(),
bag_data_cnt_
,
false
);
tree_learner_
->
ResetTrainingData
(
tmp_subset_
.
get
());
}
}
/*!
* \brief Get Type name of this boosting object
*/
const
char
*
SubModelName
()
const
override
{
return
"tree"
;
}
private:
std
::
vector
<
data_size_t
>
tmp_indice_right_
;
};
}
// namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
src/boosting/score_updater.hpp
View file @
4f77bd28
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/tree.h>
...
...
src/c_api.cpp
View file @
4f77bd28
...
@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
...
@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol
);
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
if
(
std
::
fabs
(
row
[
j
])
>
1e-15
)
{
if
(
std
::
fabs
(
row
[
j
])
>
kEpsilon
)
{
sample_values
[
j
].
push_back
(
row
[
j
]);
sample_values
[
j
].
emplace_back
(
row
[
j
]);
sample_idx
[
j
].
emplace_back
(
static_cast
<
int
>
(
i
));
}
}
}
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
}
...
@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
...
@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
;
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
sample_values
.
resize
(
inner_data
.
first
+
1
);
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
sample_idx
.
resize
(
inner_data
.
first
+
1
);
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
();
}
}
}
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
if
(
std
::
fabs
(
inner_data
.
second
)
>
kEpsilon
)
{
// edit the feature value
// edit the feature value
sample_values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
sample_values
[
inner_data
.
first
].
emplace_back
(
inner_data
.
second
);
sample_idx
[
inner_data
.
first
].
emplace_back
(
static_cast
<
int
>
(
i
));
}
}
}
}
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
}
...
@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
...
@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol_ptr
-
1
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol_ptr
-
1
);
std
::
vector
<
std
::
vector
<
int
>>
sample_idx
(
ncol_ptr
-
1
);
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
for
(
int
j
=
0
;
j
<
sample_cnt
;
j
++
)
{
for
(
int
j
=
0
;
j
<
sample_cnt
;
j
++
)
{
auto
val
=
col_it
.
Get
(
sample_indices
[
j
]);
auto
val
=
col_it
.
Get
(
sample_indices
[
j
]);
if
(
std
::
fabs
(
val
)
>
kEpsilon
)
{
if
(
std
::
fabs
(
val
)
>
kEpsilon
)
{
sample_values
[
i
].
push_back
(
val
);
sample_values
[
i
].
emplace_back
(
val
);
sample_idx
[
i
].
emplace_back
(
j
);
}
}
}
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_idx
,
sample_cnt
,
nrow
));
}
else
{
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
C
opyFeatureMapperFrom
(
ret
->
C
reateValid
(
reinterpret_cast
<
const
Dataset
*>
(
reference
));
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
}
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
const
int
tid
=
omp_get_thread_num
();
int
feature_idx
=
ret
->
Get
InnerFeatureIndex
(
i
);
int
feature_idx
=
ret
->
InnerFeatureIndex
(
i
);
if
(
feature_idx
<
0
)
{
continue
;
}
if
(
feature_idx
<
0
)
{
continue
;
}
int
group
=
ret
->
Feature2Group
(
feature_idx
);
int
sub_feature
=
ret
->
Feture2SubFeature
(
feature_idx
);
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
CSC_RowIterator
col_it
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
,
i
);
int
row_idx
=
0
;
int
row_idx
=
0
;
while
(
row_idx
<
nrow
)
{
while
(
row_idx
<
nrow
)
{
...
@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
...
@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx
=
pair
.
first
;
row_idx
=
pair
.
first
;
// no more data
// no more data
if
(
row_idx
<
0
)
{
break
;
}
if
(
row_idx
<
0
)
{
break
;
}
ret
->
FeatureAt
(
feature_idx
)
->
PushData
(
tid
,
row_idx
,
pair
.
second
);
ret
->
Push
One
Data
(
tid
,
row_idx
,
group
,
sub_feature
,
pair
.
second
);
}
}
}
}
ret
->
FinishLoad
();
ret
->
FinishLoad
();
...
...
src/io/bin.cpp
View file @
4f77bd28
...
@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) {
...
@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_
=
other
.
num_bin_
;
num_bin_
=
other
.
num_bin_
;
is_trival_
=
other
.
is_trival_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_type_
=
other
.
bin_type_
;
bin_upper_bound_
=
other
.
bin_upper_bound_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
min_val_
=
other
.
min_val_
;
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
}
}
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
...
@@ -43,37 +37,48 @@ BinMapper::~BinMapper() {
...
@@ -43,37 +37,48 @@ BinMapper::~BinMapper() {
}
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
,
BinType
bin_type
)
{
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
)
{
bin_type_
=
bin_type
;
int
sum_left
=
0
;
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
size_t
sample_size
=
total_sample_cnt
;
sum_left
+=
cnt_in_bin
[
i
];
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
ref_values
.
size
());
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
return
true
;
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
)
{
// limit max_bin by min_data_in_bin
std
::
vector
<
double
>&
raw_values
=
values
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
raw_values
.
size
());
// find distinct_values first
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
r
ef
_values
.
begin
(),
r
ef
_values
.
end
());
std
::
sort
(
r
aw
_values
.
begin
(),
r
aw
_values
.
end
());
// push zero in the front
// push zero in the front
if
(
r
ef
_values
.
empty
()
||
(
r
ef
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
if
(
r
aw
_values
.
empty
()
||
(
r
aw
_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0
);
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
}
if
(
!
r
ef
_values
.
empty
())
{
if
(
!
r
aw
_values
.
empty
())
{
distinct_values
.
push_back
(
r
ef
_values
[
0
]);
distinct_values
.
push_back
(
r
aw
_values
[
0
]);
counts
.
push_back
(
1
);
counts
.
push_back
(
1
);
}
}
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
raw_values
.
size
();
++
i
)
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
raw_values
[
i
]
!=
raw_values
[
i
-
1
])
{
if
(
ref_values
[
i
-
1
]
==
0.0
f
)
{
if
(
raw_values
[
i
-
1
]
<
0.0
f
&&
raw_values
[
i
]
>
0.0
f
)
{
counts
.
back
()
+=
zero_cnt
;
distinct_values
.
push_back
(
0.0
f
);
}
else
if
(
ref_values
[
i
-
1
]
<
0.0
f
&&
ref_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
}
distinct_values
.
push_back
(
r
ef
_values
[
i
]);
distinct_values
.
push_back
(
r
aw
_values
[
i
]);
counts
.
push_back
(
1
);
counts
.
push_back
(
1
);
}
else
{
}
else
{
++
counts
.
back
();
++
counts
.
back
();
...
@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
...
@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
}
// push zero in the back
// push zero in the back
if
(
!
r
ef
_values
.
empty
()
&&
r
ef
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
if
(
!
r
aw
_values
.
empty
()
&&
r
aw
_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
);
distinct_values
.
push_back
(
0
.0
f
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
}
min_val_
=
distinct_values
.
front
();
min_val_
=
distinct_values
.
front
();
max_val_
=
distinct_values
.
back
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
num_values
<=
max_bin
)
{
std
::
sort
(
distinct_values
.
begin
(),
distinct_values
.
end
());
// use distinct value is enough
num_bin_
=
num_values
;
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_values
);
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
;
}
cnt_in_bin
=
counts
;
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
else
{
// mean size for one bin
double
mean_bin_size
=
sample_size
/
static_cast
<
double
>
(
max_bin
);
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
sample_size
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
if
(
num_values
<=
max_bin
)
{
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
// use distinct value is enough
int
cur_cnt_inbin
=
0
;
bin_upper_bound_
.
clear
();
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
int
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
rest_sample_cnt
-=
counts
[
i
];
cur_cnt_inbin
+=
counts
[
i
];
}
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
cur_cnt_inbin
+=
counts
[
i
];
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
// need a new bin
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
cur_cnt_inbin
=
0
;
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
}
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
}
else
{
// convert to int type first
if
(
min_data_in_bin
>
0
)
{
std
::
vector
<
int
>
distinct_values_int
;
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
std
::
vector
<
int
>
counts_int
;
max_bin
=
std
::
max
(
max_bin
,
1
);
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]));
}
counts_int
.
push_back
(
counts
[
0
]);
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
if
(
zero_cnt
>
mean_bin_size
)
{
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
counts_int
.
push_back
(
counts
[
i
]);
}
}
else
{
// mean size for one bin
counts_int
.
back
()
+=
counts
[
i
];
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
}
}
// sort by counts
cur_cnt_inbin
+=
counts
.
back
();
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
// will ingore the categorical of small counts
++
bin_cnt
;
const
int
cut_cnt
=
static_cast
<
int
>
(
sample_size
*
0.95
f
);
// update bin upper bound
categorical_2_bin_
.
clear
();
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
bin_2_categorical_
.
clear
();
num_bin_
=
bin_cnt
;
num_bin_
=
0
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
int
used_cnt
=
0
;
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
}
}
cnt_in_bin
=
counts_int
;
// last bin upper bound
cnt_in_bin
[
0
]
+
=
st
atic_cast
<
int
>
(
sample_size
)
-
used_cnt
;
bin_upper_bound_
[
bin_cnt
-
1
]
=
st
d
::
numeric_limits
<
double
>::
infinity
()
;
}
}
// check trival(num_bin_ == 1) feature
// check trival(num_bin_ == 1) feature
if
(
num_bin_
<=
1
)
{
if
(
num_bin_
<=
1
)
{
is_trival_
=
true
;
is_trival_
=
true
;
default_bin_
=
0
;
}
else
{
}
else
{
is_trival_
=
false
;
is_trival_
=
false
;
default_bin_
=
ValueToBin
(
0
);
}
if
(
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
))
{
is_trival_
=
true
;
}
}
// calculate sparse rate
// calculate sparse rate
CHECK
(
num_bin_
<=
max_bin
);
CHECK
(
num_bin_
<=
max_bin
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
sample_
size
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
total_
sample_
cnt
);
}
}
...
@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
...
@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size
+=
sizeof
(
int
);
size
+=
sizeof
(
int
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinTyp
e
);
size
+=
2
*
sizeof
(
doubl
e
);
size
+=
bin
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
return
size
;
return
size
;
}
}
...
@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) {
...
@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
buffer
+=
sizeof
(
default_bin_
);
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
}
}
}
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
...
@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) {
...
@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
buffer
+=
sizeof
(
default_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
else
{
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
));
categorical_2_bin_
.
clear
();
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
}
}
}
}
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
}
}
}
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
);
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
ret
+=
sizeof
(
int
)
*
num_bin_
;
}
return
ret
;
return
ret
;
}
}
...
@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>;
...
@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>;
template
class
DenseBin
<
uint16_t
>;
template
class
DenseBin
<
uint16_t
>;
template
class
DenseBin
<
uint32_t
>;
template
class
DenseBin
<
uint32_t
>;
template
class
DenseCategoricalBin
<
uint8_t
>;
template
class
DenseCategoricalBin
<
uint16_t
>;
template
class
DenseCategoricalBin
<
uint32_t
>;
template
class
SparseBin
<
uint8_t
>;
template
class
SparseBin
<
uint8_t
>;
template
class
SparseBin
<
uint16_t
>;
template
class
SparseBin
<
uint16_t
>;
template
class
SparseBin
<
uint32_t
>;
template
class
SparseBin
<
uint32_t
>;
template
class
SparseCategoricalBin
<
uint8_t
>;
template
class
SparseCategoricalBin
<
uint16_t
>;
template
class
SparseCategoricalBin
<
uint32_t
>;
template
class
OrderedSparseBin
<
uint8_t
>;
template
class
OrderedSparseBin
<
uint8_t
>;
template
class
OrderedSparseBin
<
uint16_t
>;
template
class
OrderedSparseBin
<
uint16_t
>;
template
class
OrderedSparseBin
<
uint32_t
>;
template
class
OrderedSparseBin
<
uint32_t
>;
double
BinMapper
::
kSparseThreshold
=
0.8
f
;
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
bool
*
is_sparse
,
uint32_t
default_bin
,
BinType
bin_type
)
{
bool
is_enable_sparse
,
bool
*
is_sparse
)
{
// sparse threshold
// sparse threshold
const
double
kSparseThreshold
=
0.8
f
;
if
(
sparse_rate
>=
BinMapper
::
kSparseThreshold
&&
is_enable_sparse
)
{
if
(
sparse_rate
>=
kSparseThreshold
&&
is_enable_sparse
)
{
*
is_sparse
=
true
;
*
is_sparse
=
true
;
return
CreateSparseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateSparseBin
(
num_data
,
num_bin
);
}
else
{
}
else
{
*
is_sparse
=
false
;
*
is_sparse
=
false
;
return
CreateDenseBin
(
num_data
,
num_bin
,
default_bin
,
bin_type
);
return
CreateDenseBin
(
num_data
,
num_bin
);
}
}
}
}
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
uint32_t
default_bin
,
BinType
bin_type
)
{
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
256
)
{
if
(
num_bin
<=
255
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
);
return
new
DenseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65536
)
{
}
else
if
(
num_bin
<=
65535
)
{
return
new
DenseBin
<
uint16_t
>
(
num_data
);
return
new
DenseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
DenseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
else
{
}
else
{
if
(
num_bin
<=
255
)
{
return
new
DenseBin
<
uint32_t
>
(
num_data
);
return
new
DenseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
DenseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
DenseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
}
}
}
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
uint32_t
default_bin
,
BinType
bin_type
)
{
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
256
)
{
if
(
num_bin
<=
255
)
{
return
new
SparseBin
<
uint8_t
>
(
num_data
);
return
new
SparseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65536
)
{
}
else
if
(
num_bin
<=
65535
)
{
return
new
SparseBin
<
uint16_t
>
(
num_data
);
return
new
SparseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
SparseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
else
{
}
else
{
if
(
num_bin
<=
255
)
{
return
new
SparseBin
<
uint32_t
>
(
num_data
);
return
new
SparseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
65535
)
{
return
new
SparseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
return
new
SparseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
}
}
}
...
...
src/io/config.cpp
View file @
4f77bd28
...
@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
...
@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed.
// generate seeds by seed.
if
(
GetInt
(
params
,
"seed"
,
&
seed
))
{
if
(
GetInt
(
params
,
"seed"
,
&
seed
))
{
Random
rand
(
seed
);
Random
rand
(
seed
);
int
int_max
=
std
::
numeric_limits
<
in
t
>::
max
();
int
int_max
=
std
::
numeric_limits
<
shor
t
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
In
t
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
Next
Shor
t
(
0
,
int_max
));
}
}
GetTaskType
(
params
);
GetTaskType
(
params
);
GetBoostingType
(
params
);
GetBoostingType
(
params
);
...
@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
...
@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type
=
"gbdt"
;
boosting_type
=
"gbdt"
;
}
else
if
(
value
==
std
::
string
(
"dart"
))
{
}
else
if
(
value
==
std
::
string
(
"dart"
))
{
boosting_type
=
"dart"
;
boosting_type
=
"dart"
;
}
else
if
(
value
==
std
::
string
(
"goss"
))
{
boosting_type
=
"goss"
;
}
else
{
}
else
{
Log
::
Fatal
(
"Unknown boosting type %s"
,
value
.
c_str
());
Log
::
Fatal
(
"Unknown boosting type %s"
,
value
.
c_str
());
}
}
...
@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
...
@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"categorical_column"
,
&
categorical_column
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
GetBool
(
params
,
"enable_bundle"
,
&
enable_bundle
);
GetBool
(
params
,
"adjacent_bundle"
,
&
adjacent_bundle
);
}
}
...
@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
...
@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt
(
params
,
"max_drop"
,
&
max_drop
);
GetInt
(
params
,
"max_drop"
,
&
max_drop
);
GetBool
(
params
,
"xgboost_dart_mode"
,
&
xgboost_dart_mode
);
GetBool
(
params
,
"xgboost_dart_mode"
,
&
xgboost_dart_mode
);
GetBool
(
params
,
"uniform_drop"
,
&
uniform_drop
);
GetBool
(
params
,
"uniform_drop"
,
&
uniform_drop
);
GetDouble
(
params
,
"top_rate"
,
&
top_rate
);
GetDouble
(
params
,
"other_rate"
,
&
other_rate
);
CHECK
(
drop_rate
<=
1.0
&&
drop_rate
>=
0.0
);
CHECK
(
drop_rate
<=
1.0
&&
drop_rate
>=
0.0
);
CHECK
(
skip_drop
<=
1.0
&&
skip_drop
>=
0.0
);
CHECK
(
skip_drop
<=
1.0
&&
skip_drop
>=
0.0
);
GetTreeLearnerType
(
params
);
GetTreeLearnerType
(
params
);
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment