Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
9962e6d6
Commit
9962e6d6
authored
Jan 25, 2017
by
Guolin Ke
Browse files
support negative values for sparse features.
parent
1765b2e3
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
87 additions
and
56 deletions
+87
-56
R-package/lightgbm_0.1.zip
R-package/lightgbm_0.1.zip
+0
-0
include/LightGBM/bin.h
include/LightGBM/bin.h
+5
-5
include/LightGBM/config.h
include/LightGBM/config.h
+1
-1
src/io/bin.cpp
src/io/bin.cpp
+33
-18
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+2
-8
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+24
-24
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+22
-0
No files found.
R-package/lightgbm_0.1.zip
0 → 100644
View file @
9962e6d6
File added
include/LightGBM/bin.h
View file @
9962e6d6
...
@@ -21,9 +21,9 @@ enum BinType {
...
@@ -21,9 +21,9 @@ enum BinType {
struct
HistogramBinEntry
{
struct
HistogramBinEntry
{
public:
public:
/*! \brief Sum of gradients on this bin */
/*! \brief Sum of gradients on this bin */
double
sum_gradients
=
0.0
;
double
sum_gradients
=
0.0
f
;
/*! \brief Sum of hessians on this bin */
/*! \brief Sum of hessians on this bin */
double
sum_hessians
=
0.0
;
double
sum_hessians
=
0.0
f
;
/*! \brief Number of data on this bin */
/*! \brief Number of data on this bin */
data_size_t
cnt
=
0
;
data_size_t
cnt
=
0
;
...
@@ -352,7 +352,7 @@ public:
...
@@ -352,7 +352,7 @@ public:
*/
*/
static
Bin
*
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
static
Bin
*
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
double
sparse_rate
,
bool
is_enable_sparse
,
bool
*
is_sparse
,
int
default_bin
,
BinType
bin_type
);
bool
*
is_sparse
,
u
int
32_t
default_bin
,
BinType
bin_type
);
/*!
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \brief Create object for bin data of one feature, used for dense feature
...
@@ -363,7 +363,7 @@ public:
...
@@ -363,7 +363,7 @@ public:
* \return The bin data object
* \return The bin data object
*/
*/
static
Bin
*
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
static
Bin
*
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
);
u
int
32_t
default_bin
,
BinType
bin_type
);
/*!
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \brief Create object for bin data of one feature, used for sparse feature
...
@@ -374,7 +374,7 @@ public:
...
@@ -374,7 +374,7 @@ public:
* \return The bin data object
* \return The bin data object
*/
*/
static
Bin
*
CreateSparseBin
(
data_size_t
num_data
,
static
Bin
*
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
);
int
num_bin
,
u
int
32_t
default_bin
,
BinType
bin_type
);
};
};
inline
unsigned
int
BinMapper
::
ValueToBin
(
double
value
)
const
{
inline
unsigned
int
BinMapper
::
ValueToBin
(
double
value
)
const
{
...
...
include/LightGBM/config.h
View file @
9962e6d6
...
@@ -85,7 +85,7 @@ enum TaskType {
...
@@ -85,7 +85,7 @@ enum TaskType {
/*! \brief Config for input and output files */
/*! \brief Config for input and output files */
struct
IOConfig
:
public
ConfigBase
{
struct
IOConfig
:
public
ConfigBase
{
public:
public:
int
max_bin
=
25
6
;
int
max_bin
=
25
5
;
int
num_class
=
1
;
int
num_class
=
1
;
int
data_random_seed
=
1
;
int
data_random_seed
=
1
;
std
::
string
data_filename
=
""
;
std
::
string
data_filename
=
""
;
...
...
src/io/bin.cpp
View file @
9962e6d6
...
@@ -30,6 +30,8 @@ BinMapper::BinMapper(const BinMapper& other) {
...
@@ -30,6 +30,8 @@ BinMapper::BinMapper(const BinMapper& other) {
bin_2_categorical_
=
other
.
bin_2_categorical_
;
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
}
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
}
}
...
@@ -85,6 +87,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
...
@@ -85,6 +87,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
}
}
min_val_
=
distinct_values
.
front
();
min_val_
=
distinct_values
.
front
();
max_val_
=
distinct_values
.
back
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
cnt_in_bin0
=
0
;
int
cnt_in_bin0
=
0
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
...
@@ -96,7 +99,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
...
@@ -96,7 +99,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
;
bin_upper_bound_
[
i
]
=
(
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
;
}
}
cnt_in_bin
0
=
counts
[
0
]
;
cnt_in_bin
=
counts
;
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
else
{
}
else
{
// mean size for one bin
// mean size for one bin
...
@@ -128,9 +131,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
...
@@ -128,9 +131,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
if
(
bin_cnt
==
0
)
{
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cnt_in_bin0
=
cur_cnt_inbin
;
}
++
bin_cnt
;
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
...
@@ -183,7 +184,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
...
@@ -183,7 +184,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
Log
::
Warning
(
"Too many categoricals are ignored, \
Log
::
Warning
(
"Too many categoricals are ignored, \
please use bigger max_bin or partition column
\"
%s
\"
"
,
column_name
.
c_str
());
please use bigger max_bin or partition column
\"
%s
\"
"
,
column_name
.
c_str
());
}
}
cnt_in_bin0
=
static_cast
<
int
>
(
sample_size
)
-
used_cnt
+
counts_int
[
0
];
cnt_in_bin
=
counts_int
;
cnt_in_bin
[
0
]
+=
static_cast
<
int
>
(
sample_size
)
-
used_cnt
;
}
}
// check trival(num_bin_ == 1) feature
// check trival(num_bin_ == 1) feature
...
@@ -193,7 +195,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
...
@@ -193,7 +195,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
is_trival_
=
false
;
is_trival_
=
false
;
}
}
// calculate sparse rate
// calculate sparse rate
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin0
)
/
static_cast
<
double
>
(
sample_size
);
CHECK
(
num_bin_
<=
max_bin
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
sample_size
);
}
}
...
@@ -216,6 +219,11 @@ void BinMapper::CopyTo(char * buffer) {
...
@@ -216,6 +219,11 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
buffer
+=
sizeof
(
bin_type_
);
// Serialize the observed value range after bin_type_. CopyTo writes the
// members INTO the buffer, so the buffer is the memcpy destination here;
// CopyFrom performs the mirror-image read in the same field order.
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
}
else
{
...
@@ -232,6 +240,11 @@ void BinMapper::CopyFrom(const char * buffer) {
...
@@ -232,6 +240,11 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
...
@@ -250,6 +263,8 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
...
@@ -250,6 +263,8 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
}
else
{
...
@@ -259,7 +274,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
...
@@ -259,7 +274,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
bin_type_
);
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
}
else
{
...
@@ -290,7 +305,7 @@ template class OrderedSparseBin<uint32_t>;
...
@@ -290,7 +305,7 @@ template class OrderedSparseBin<uint32_t>;
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
Bin
*
Bin
::
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
bool
*
is_sparse
,
int
default_bin
,
BinType
bin_type
)
{
bool
is_enable_sparse
,
bool
*
is_sparse
,
u
int
32_t
default_bin
,
BinType
bin_type
)
{
// sparse threshold
// sparse threshold
const
double
kSparseThreshold
=
0.8
f
;
const
double
kSparseThreshold
=
0.8
f
;
if
(
sparse_rate
>=
kSparseThreshold
&&
is_enable_sparse
)
{
if
(
sparse_rate
>=
kSparseThreshold
&&
is_enable_sparse
)
{
...
@@ -302,19 +317,19 @@ Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
...
@@ -302,19 +317,19 @@ Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
}
}
}
}
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
)
{
Bin
*
Bin
::
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
,
u
int
32_t
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
25
6
)
{
if
(
num_bin
<=
25
5
)
{
return
new
DenseBin
<
uint8_t
>
(
num_data
,
default_bin
);
return
new
DenseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
6553
6
)
{
}
else
if
(
num_bin
<=
6553
5
)
{
return
new
DenseBin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
DenseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
}
else
{
return
new
DenseBin
<
uint32_t
>
(
num_data
,
default_bin
);
return
new
DenseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
}
else
{
}
else
{
if
(
num_bin
<=
25
6
)
{
if
(
num_bin
<=
25
5
)
{
return
new
DenseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
return
new
DenseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
6553
6
)
{
}
else
if
(
num_bin
<=
6553
5
)
{
return
new
DenseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
DenseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
}
else
{
return
new
DenseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
return
new
DenseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
...
@@ -322,19 +337,19 @@ Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, Bin
...
@@ -322,19 +337,19 @@ Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, Bin
}
}
}
}
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
int
default_bin
,
BinType
bin_type
)
{
Bin
*
Bin
::
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
,
u
int
32_t
default_bin
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
num_bin
<=
25
6
)
{
if
(
num_bin
<=
25
5
)
{
return
new
SparseBin
<
uint8_t
>
(
num_data
,
default_bin
);
return
new
SparseBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
6553
6
)
{
}
else
if
(
num_bin
<=
6553
5
)
{
return
new
SparseBin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
SparseBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
}
else
{
return
new
SparseBin
<
uint32_t
>
(
num_data
,
default_bin
);
return
new
SparseBin
<
uint32_t
>
(
num_data
,
default_bin
);
}
}
}
else
{
}
else
{
if
(
num_bin
<=
25
6
)
{
if
(
num_bin
<=
25
5
)
{
return
new
SparseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
return
new
SparseCategoricalBin
<
uint8_t
>
(
num_data
,
default_bin
);
}
else
if
(
num_bin
<=
6553
6
)
{
}
else
if
(
num_bin
<=
6553
5
)
{
return
new
SparseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
return
new
SparseCategoricalBin
<
uint16_t
>
(
num_data
,
default_bin
);
}
else
{
}
else
{
return
new
SparseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
return
new
SparseCategoricalBin
<
uint32_t
>
(
num_data
,
default_bin
);
...
...
src/io/dense_bin.hpp
View file @
9962e6d6
...
@@ -16,14 +16,8 @@ namespace LightGBM {
...
@@ -16,14 +16,8 @@ namespace LightGBM {
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
DenseBin
:
public
Bin
{
class
DenseBin
:
public
Bin
{
public:
public:
DenseBin
(
data_size_t
num_data
,
int
default_bin
)
DenseBin
(
data_size_t
num_data
,
uint32_t
default_bin
)
:
num_data_
(
num_data
)
{
:
num_data_
(
num_data
),
data_
(
num_data_
,
static_cast
<
VAL_T
>
(
default_bin
))
{
data_
.
resize
(
num_data_
);
VAL_T
default_bin_T
=
static_cast
<
VAL_T
>
(
default_bin
);
#pragma omp parallel for schedule(static)
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
data_
[
i
]
=
default_bin_T
;
}
}
}
~
DenseBin
()
{
~
DenseBin
()
{
...
...
src/io/sparse_bin.hpp
View file @
9962e6d6
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#include <cstring>
#include <cstring>
#include <cstdint>
#include <cstdint>
#include <limits>
#include <vector>
#include <vector>
namespace
LightGBM
{
namespace
LightGBM
{
...
@@ -50,12 +50,9 @@ public:
...
@@ -50,12 +50,9 @@ public:
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
SparseBin
(
data_size_t
num_data
,
int
default_bin
)
SparseBin
(
data_size_t
num_data
,
u
int
32_t
default_bin
)
:
num_data_
(
num_data
)
{
:
num_data_
(
num_data
)
{
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
);
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
);
if
(
default_bin_
!=
0
)
{
Log
::
Info
(
"Warning: sparse feature with negative values, treating negative values as zero"
);
}
#pragma omp parallel
#pragma omp parallel
#pragma omp master
#pragma omp master
{
{
...
@@ -75,9 +72,10 @@ public:
...
@@ -75,9 +72,10 @@ public:
}
}
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
override
{
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
override
{
// not store zero data
auto
cur_bin
=
static_cast
<
VAL_T
>
(
value
);
if
(
value
<=
default_bin_
)
{
return
;
}
if
(
cur_bin
!=
default_bin_
)
{
push_buffers_
[
tid
].
emplace_back
(
idx
,
static_cast
<
VAL_T
>
(
value
));
push_buffers_
[
tid
].
emplace_back
(
idx
,
cur_bin
);
}
}
}
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
...
@@ -90,10 +88,11 @@ public:
...
@@ -90,10 +88,11 @@ public:
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
data_size_t
*
cur_pos
)
const
{
data_size_t
*
cur_pos
)
const
{
const
VAL_T
non_data_flag
=
std
::
numeric_limits
<
VAL_T
>::
max
();
++
(
*
i_delta
);
++
(
*
i_delta
);
*
cur_pos
+=
deltas_
[
*
i_delta
];
*
cur_pos
+=
deltas_
[
*
i_delta
];
data_size_t
factor
=
1
;
data_size_t
factor
=
1
;
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
0
)
{
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
non_data_flag
)
{
++
(
*
i_delta
);
++
(
*
i_delta
);
factor
*=
kMaxDelta
;
factor
*=
kMaxDelta
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
...
@@ -130,41 +129,42 @@ public:
...
@@ -130,41 +129,42 @@ public:
void
FinishLoad
()
override
{
void
FinishLoad
()
override
{
// get total non zero size
// get total non zero size
size_t
non_zero_size
=
0
;
size_t
pair_cnt
=
0
;
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero_size
+=
push_buffers_
[
i
].
size
();
pair_cnt
+=
push_buffers_
[
i
].
size
();
}
}
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero
_pair
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
idx_val
_pair
s
;
// merge
// merge
non_zero
_pair
.
reserve
(
non_zero_size
);
idx_val
_pair
s
.
reserve
(
pair_cnt
);
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero
_pair
.
insert
(
non_zero
_pair
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
idx_val
_pair
s
.
insert
(
idx_val
_pair
s
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
shrink_to_fit
();
push_buffers_
[
i
].
shrink_to_fit
();
}
}
push_buffers_
.
clear
();
push_buffers_
.
clear
();
push_buffers_
.
shrink_to_fit
();
push_buffers_
.
shrink_to_fit
();
// sort by data index
// sort by data index
std
::
sort
(
non_zero
_pair
.
begin
(),
non_zero
_pair
.
end
(),
std
::
sort
(
idx_val
_pair
s
.
begin
(),
idx_val
_pair
s
.
end
(),
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
return
a
.
first
<
b
.
first
;
return
a
.
first
<
b
.
first
;
});
});
// load delta array
// load delta array
LoadFromPair
(
non_zero
_pair
);
LoadFromPair
(
idx_val
_pair
s
);
}
}
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
non_zero
_pair
)
{
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
idx_val
_pair
s
)
{
deltas_
.
clear
();
deltas_
.
clear
();
vals_
.
clear
();
vals_
.
clear
();
const
VAL_T
non_data_flag
=
std
::
numeric_limits
<
VAL_T
>::
max
();
// transform to delta array
// transform to delta array
data_size_t
last_idx
=
0
;
data_size_t
last_idx
=
0
;
for
(
size_t
i
=
0
;
i
<
non_zero
_pair
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
idx_val
_pair
s
.
size
();
++
i
)
{
const
data_size_t
cur_idx
=
non_zero
_pair
[
i
].
first
;
const
data_size_t
cur_idx
=
idx_val
_pair
s
[
i
].
first
;
const
VAL_T
bin
=
non_zero
_pair
[
i
].
second
;
const
VAL_T
bin
=
idx_val
_pair
s
[
i
].
second
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
while
(
cur_delta
>
kMaxDelta
)
{
while
(
cur_delta
>
kMaxDelta
)
{
deltas_
.
push_back
(
cur_delta
%
kMaxDelta
);
deltas_
.
push_back
(
cur_delta
%
kMaxDelta
);
vals_
.
push_back
(
0
);
vals_
.
push_back
(
non_data_flag
);
cur_delta
/=
kMaxDelta
;
cur_delta
/=
kMaxDelta
;
}
}
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
deltas_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
...
@@ -270,7 +270,7 @@ public:
...
@@ -270,7 +270,7 @@ public:
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
VAL_T
bin
=
iterator
.
InnerGet
(
used_indices
[
i
]);
VAL_T
bin
=
iterator
.
InnerGet
(
used_indices
[
i
]);
if
(
bin
>
0
)
{
if
(
bin
!=
default_bin_
)
{
tmp_pair
.
emplace_back
(
i
,
bin
);
tmp_pair
.
emplace_back
(
i
,
bin
);
}
}
}
}
...
@@ -297,7 +297,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
...
@@ -297,7 +297,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
&&
i_delta_
>=
0
)
{
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
&&
i_delta_
>=
0
)
{
return
bin_data_
->
vals_
[
i_delta_
];
return
bin_data_
->
vals_
[
i_delta_
];
}
else
{
}
else
{
return
0
;
return
bin_data_
->
default_bin_
;
}
}
}
}
...
@@ -317,7 +317,7 @@ BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
...
@@ -317,7 +317,7 @@ BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
SparseCategoricalBin
:
public
SparseBin
<
VAL_T
>
{
class
SparseCategoricalBin
:
public
SparseBin
<
VAL_T
>
{
public:
public:
SparseCategoricalBin
(
data_size_t
num_data
,
int
default_bin
)
SparseCategoricalBin
(
data_size_t
num_data
,
u
int
32_t
default_bin
)
:
SparseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
:
SparseBin
<
VAL_T
>
(
num_data
,
default_bin
)
{
}
}
...
...
src/treelearner/feature_histogram.hpp
View file @
9962e6d6
...
@@ -58,12 +58,34 @@ public:
...
@@ -58,12 +58,34 @@ public:
data_
[
i
].
sum_hessians
-=
other
.
data_
[
i
].
sum_hessians
;
data_
[
i
].
sum_hessians
-=
other
.
data_
[
i
].
sum_hessians
;
}
}
}
}
/*!
* \brief Rebuild the default bin's histogram entry of a sparse feature
*        as "totals minus every other bin"
* \param sum_gradient Total sum of gradients to start from
* \param sum_hessian Total sum of hessians to start from
* \param num_data Total number of data to start from
* \note Nothing is done for non-sparse features, nor for numerical features
*       whose default bin is 0.
*       NOTE(review): presumably needed because the sparse storage does not
*       accumulate the default bin while building the histogram - confirm.
*/
void FixIgnoreBin(double sum_gradient, double sum_hessian, data_size_t num_data) {
  // Only sparse features are adjusted.
  if (!feature_->is_sparse()) {
    return;
  }
  const auto mapped_default = feature_->bin_mapper()->GetDefaultBin();
  // No fix-up for a numerical feature whose default bin is bin 0.
  const bool is_numerical = (feature_->bin_type() == BinType::NumericalBin);
  if (is_numerical && mapped_default == 0) {
    return;
  }
  const int skip_bin = static_cast<int>(mapped_default);
  auto& default_entry = data_[skip_bin];
  // Start from the totals ...
  default_entry.sum_gradients = sum_gradient;
  default_entry.sum_hessians = sum_hessian;
  default_entry.cnt = num_data;
  // ... and subtract every other bin, walking from the highest bin down to 0
  // (the order is kept so the floating-point result does not change).
  int bin = feature_->num_bin();
  while (--bin >= 0) {
    if (bin == skip_bin) {
      continue;
    }
    const auto& other = data_[bin];
    default_entry.sum_gradients -= other.sum_gradients;
    default_entry.sum_hessians -= other.sum_hessians;
    default_entry.cnt -= other.cnt;
  }
}
/*!
/*!
* \brief Find best threshold for this histogram
* \brief Find best threshold for this histogram
* \param output The best split result
* \param output The best split result
*/
*/
void
FindBestThreshold
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
void
FindBestThreshold
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
SplitInfo
*
output
)
{
SplitInfo
*
output
)
{
FixIgnoreBin
(
sum_gradient
,
sum_hessian
,
num_data
);
find_best_threshold_fun_
(
sum_gradient
,
sum_hessian
+
2
*
kEpsilon
,
num_data
,
output
);
find_best_threshold_fun_
(
sum_gradient
,
sum_hessian
+
2
*
kEpsilon
,
num_data
,
output
);
if
(
output
->
gain
>
kMinScore
)
{
if
(
output
->
gain
>
kMinScore
)
{
is_splittable_
=
true
;
is_splittable_
=
true
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment