Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
c62dcf73
Commit
c62dcf73
authored
Aug 18, 2017
by
Guolin Ke
Browse files
fix merge bugs.
parent
7a82ba4f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
875 additions
and
831 deletions
+875
-831
include/LightGBM/bin.h
include/LightGBM/bin.h
+435
-431
include/LightGBM/tree.h
include/LightGBM/tree.h
+2
-1
src/io/bin.cpp
src/io/bin.cpp
+437
-398
src/io/config.cpp
src/io/config.cpp
+1
-1
No files found.
include/LightGBM/bin.h
View file @
c62dcf73
...
@@ -12,458 +12,462 @@
...
@@ -12,458 +12,462 @@
namespace
LightGBM
{
namespace
LightGBM
{
enum
BinType
{
enum
BinType
{
NumericalBin
,
NumericalBin
,
CategoricalBin
CategoricalBin
};
};
enum
MissingType
{
enum
MissingType
{
None
,
None
,
Zero
,
Zero
,
NaN
NaN
};
};
/*! \brief Store data for one histogram bin */
/*! \brief Store data for one histogram bin */
struct
HistogramBinEntry
{
struct
HistogramBinEntry
{
public:
public:
/*! \brief Sum of gradients on this bin */
/*! \brief Sum of gradients on this bin */
double
sum_gradients
=
0.0
f
;
double
sum_gradients
=
0.0
f
;
/*! \brief Sum of hessians on this bin */
/*! \brief Sum of hessians on this bin */
double
sum_hessians
=
0.0
f
;
double
sum_hessians
=
0.0
f
;
/*! \brief Number of data on this bin */
/*! \brief Number of data on this bin */
data_size_t
cnt
=
0
;
data_size_t
cnt
=
0
;
/*!
/*!
* \brief Sum up (reducers) functions for histogram bin
* \brief Sum up (reducers) functions for histogram bin
*/
*/
inline
static
void
SumReducer
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
inline
static
void
SumReducer
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
const
int
type_size
=
sizeof
(
HistogramBinEntry
);
const
int
type_size
=
sizeof
(
HistogramBinEntry
);
int
used_size
=
0
;
int
used_size
=
0
;
const
HistogramBinEntry
*
p1
;
const
HistogramBinEntry
*
p1
;
HistogramBinEntry
*
p2
;
HistogramBinEntry
*
p2
;
while
(
used_size
<
len
)
{
while
(
used_size
<
len
)
{
// convert
// convert
p1
=
reinterpret_cast
<
const
HistogramBinEntry
*>
(
src
);
p1
=
reinterpret_cast
<
const
HistogramBinEntry
*>
(
src
);
p2
=
reinterpret_cast
<
HistogramBinEntry
*>
(
dst
);
p2
=
reinterpret_cast
<
HistogramBinEntry
*>
(
dst
);
// add
// add
p2
->
cnt
+=
p1
->
cnt
;
p2
->
cnt
+=
p1
->
cnt
;
p2
->
sum_gradients
+=
p1
->
sum_gradients
;
p2
->
sum_gradients
+=
p1
->
sum_gradients
;
p2
->
sum_hessians
+=
p1
->
sum_hessians
;
p2
->
sum_hessians
+=
p1
->
sum_hessians
;
src
+=
type_size
;
src
+=
type_size
;
dst
+=
type_size
;
dst
+=
type_size
;
used_size
+=
type_size
;
used_size
+=
type_size
;
}
}
}
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class
BinMapper
{
public:
BinMapper
();
BinMapper
(
const
BinMapper
&
other
);
explicit
BinMapper
(
const
void
*
memory
);
~
BinMapper
();
bool
CheckAlign
(
const
BinMapper
&
other
)
const
{
if
(
num_bin_
!=
other
.
num_bin_
)
{
return
false
;
}
if
(
missing_type_
!=
other
.
missing_type_
)
{
return
false
;
}
}
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
};
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
if
(
bin_upper_bound_
[
i
]
!=
other
.
bin_upper_bound_
[
i
])
{
/*! \brief This class used to convert feature values into bin,
return
false
;
* and store some meta information for bin*/
}
class
BinMapper
{
public:
BinMapper
();
BinMapper
(
const
BinMapper
&
other
);
explicit
BinMapper
(
const
void
*
memory
);
~
BinMapper
();
bool
CheckAlign
(
const
BinMapper
&
other
)
const
{
if
(
num_bin_
!=
other
.
num_bin_
)
{
return
false
;
}
}
}
else
{
if
(
missing_type_
!=
other
.
missing_type_
)
{
for
(
int
i
=
0
;
i
<
num_bin_
;
i
++
)
{
return
false
;
if
(
bin_2_categorical_
[
i
]
!=
other
.
bin_2_categorical_
[
i
])
{
}
return
false
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
if
(
bin_upper_bound_
[
i
]
!=
other
.
bin_upper_bound_
[
i
])
{
return
false
;
}
}
}
else
{
for
(
int
i
=
0
;
i
<
num_bin_
;
i
++
)
{
if
(
bin_2_categorical_
[
i
]
!=
other
.
bin_2_categorical_
[
i
])
{
return
false
;
}
}
}
}
}
return
true
;
}
}
return
true
;
}
/*! \brief Get number of bins */
/*! \brief Get number of bins */
inline
int
num_bin
()
const
{
return
num_bin_
;
}
inline
int
num_bin
()
const
{
return
num_bin_
;
}
/*! \brief Missing Type */
/*! \brief Missing Type */
inline
MissingType
missing_type
()
const
{
return
missing_type_
;
}
inline
MissingType
missing_type
()
const
{
return
missing_type_
;
}
/*! \brief True if bin is trival (contains only one bin) */
/*! \brief True if bin is trival (contains only one bin) */
inline
bool
is_trival
()
const
{
return
is_trival_
;
}
inline
bool
is_trival
()
const
{
return
is_trival_
;
}
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline
double
sparse_rate
()
const
{
return
sparse_rate_
;
}
inline
double
sparse_rate
()
const
{
return
sparse_rate_
;
}
/*!
/*!
* \brief Save binary data to file
* \brief Save binary data to file
* \param file File want to write
* \param file File want to write
*/
*/
void
SaveBinaryToFile
(
FILE
*
file
)
const
;
void
SaveBinaryToFile
(
FILE
*
file
)
const
;
/*!
/*!
* \brief Mapping bin into feature value
* \brief Mapping bin into feature value
* \param bin
* \param bin
* \return Feature value of this bin
* \return Feature value of this bin
*/
*/
inline
double
BinToValue
(
uint32_t
bin
)
const
{
inline
double
BinToValue
(
uint32_t
bin
)
const
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
return
bin_upper_bound_
[
bin
];
return
bin_upper_bound_
[
bin
];
}
else
{
}
else
{
return
bin_2_categorical_
[
bin
];
return
bin_2_categorical_
[
bin
];
}
}
}
}
/*!
/*!
* \brief Get sizes in byte of this object
* \brief Get sizes in byte of this object
*/
*/
size_t
SizesInByte
()
const
;
size_t
SizesInByte
()
const
;
/*!
/*!
* \brief Mapping feature value into bin
* \
brief Mapping feature value into bin
* \
param value
* \
param
value
* \
return bin for this feature
value
* \return bin for this feature value
*/
*/
inline
uint32_t
ValueToBin
(
double
value
)
const
;
inline
uint32_t
ValueToBin
(
double
value
)
const
;
/*!
/*!
* \brief Get the default bin when value is 0
* \
brief Get the default bin when value is 0
* \
return default bin
* \return default bin
*/
*/
inline
uint32_t
GetDefaultBin
()
const
{
inline
uint32_t
GetDefaultBin
()
const
{
return
default_bin_
;
return
default_bin_
;
}
}
/*!
/*!
* \brief Construct feature value to bin mapper according feature values
* \
brief Construct feature value to bin mapper according feature values
* \
param values (Sampled) values of this feature, Note: not include zero.
* \param values
(Sampled) values of this feature, Note: not include zero.
* \param
num_
values
number of values.
* \param
num_values number of values.
* \param
total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param
total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param
max_bin The maximal number of bin
* \param m
ax_bin The maximal number of
bin
* \param m
in_data_in_bin min number of data in one
bin
* \param min_
data_in_bin min number of data in one bin
* \param min_
split_data
* \param
m
in_
split_data
* \param
b
in_
type Type of this bin
* \param
bin_type Type of this bin
* \param
use_missing True to enable missing value handle
* \param
use
_missing True to
enable
missing value
handle
* \param
zero_as
_missing True to
use zero as
missing value
* \param zero_as_missing True to use zero as missing value
*/
*/
void
FindBin
(
double
*
values
,
int
num_values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
,
void
FindBin
(
double
*
values
,
int
num_values
,
size_t
total_sample_cnt
,
int
max_b
in
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
,
bool
use_miss
in
g
,
bool
zero_as_missing
);
bool
use_missing
,
bool
zero_as_missing
);
/*!
/*!
* \brief Use specific number of bin to calculate the size of this class
* \
brief Use specific number of bin to calculate the size of this class
* \
param bin The number of bin
* \
param bin The number of bin
* \
return Size
* \return Size
*/
*/
static
int
SizeForSpecificBin
(
int
bin
);
static
int
SizeForSpecificBin
(
int
bin
);
/*!
/*!
* \brief Seirilizing this object to buffer
* \
brief Seirilizing this object to buffer
* \
param buffer The destination
* \param buffer The destination
*/
*/
void
CopyTo
(
char
*
buffer
)
const
;
void
CopyTo
(
char
*
buffer
)
const
;
/*!
/*!
* \brief Deserilizing this object from buffer
* \
brief Deserilizing this object from buffer
* \
param buffer The source
* \param buffer The source
*/
*/
void
CopyFrom
(
const
char
*
buffer
);
void
CopyFrom
(
const
char
*
buffer
);
/*!
/*!
* \brief Get bin types
* \brief Get bin types
*/
*/
inline
BinType
bin_type
()
const
{
return
bin_type_
;
}
inline
BinType
bin_type
()
const
{
return
bin_type_
;
}
/*!
/*!
* \brief Get bin info
* \brief Get bin info
*/
*/
inline
std
::
string
bin_info
()
const
{
inline
std
::
string
bin_info
()
const
{
if
(
bin_type_
==
BinType
::
CategoricalBin
)
{
if
(
bin_type_
==
BinType
::
C
ategorical
Bin
)
{
return
Common
::
Join
(
bin_2_c
ategorical
_
,
":"
);
return
Common
::
Join
(
bin_2_categorical_
,
":"
);
}
else
{
}
else
{
std
::
stringstream
str_buf
;
std
::
stringstream
str_buf
;
str_buf
<<
std
::
setprecision
(
std
::
numeric_limits
<
double
>::
digits10
+
2
)
;
str_buf
<<
std
::
setprecision
(
std
::
numeric_limits
<
double
>::
digits10
+
2
)
;
str_buf
<<
'['
<<
min_val_
<<
':'
<<
max_val_
<<
']'
;
str_buf
<<
'['
<<
min_val_
<<
':'
<<
max_val_
<<
']'
;
return
str_buf
.
str
()
;
return
str_buf
.
str
();
}
}
}
}
private:
/*! \brief Number of bins */
int
num_bin_
;
MissingType
missing_type_
;
/*! \brief Store upper bound for each bin */
std
::
vector
<
double
>
bin_upper_bound_
;
/*! \brief True if this feature is trival */
bool
is_trival_
;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double
sparse_rate_
;
/*! \brief Type of this bin */
BinType
bin_type_
;
/*! \brief Mapper from categorical to bin */
std
::
unordered_map
<
int
,
unsigned
int
>
categorical_2_bin_
;
/*! \brief Mapper from bin to categorical */
std
::
vector
<
int
>
bin_2_categorical_
;
/*! \brief minimal feature vaule */
double
min_val_
;
/*! \brief maximum feature value */
double
max_val_
;
/*! \brief bin value of feature value 0 */
uint32_t
default_bin_
;
};
/*!
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* There are 2 advantages by using ordered bin.
* 1. group the data by leafs to improve the cache hit.
* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* So we only using ordered bin for sparse situations.
*/
class
OrderedBin
{
public:
/*! \brief virtual destructor */
virtual
~
OrderedBin
()
{}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was build for bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual
void
Init
(
const
char
*
used_indices
,
data_size_t
num_leaves
)
=
0
;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param hessians Hessians, Note:non-oredered by leaf
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradients
,
const
score_t
*
hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
* \param right_leaf The new leaf index after perform this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
is_in_leaf
,
char
mark
)
=
0
;
virtual
data_size_t
NonZeroCount
(
int
leaf
)
const
=
0
;
};
/*! \brief Iterator for one bin column */
class
BinIterator
{
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual
uint32_t
Get
(
data_size_t
idx
)
=
0
;
virtual
uint32_t
RawGet
(
data_size_t
idx
)
=
0
;
virtual
void
Reset
(
data_size_t
idx
)
=
0
;
virtual
~
BinIterator
()
=
default
;
};
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Note that it may cause cache misses when construct histogram,
* but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class
Bin
{
public:
/*! \brief virtual destructor */
virtual
~
Bin
()
{}
/*!
* \brief Push one record
* \pram tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
=
0
;
virtual
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
=
0
;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
=
0
;
/*!
* \brief Save binary data to file
* \param file File want to write
*/
virtual
void
SaveBinaryToFile
(
FILE
*
file
)
const
=
0
;
/*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual
void
LoadFromMemory
(
const
void
*
memory
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
=
0
;
/*!
* \brief Get sizes in byte of this object
*/
virtual
size_t
SizesInByte
()
const
=
0
;
/*! \brief Number of all data */
virtual
data_size_t
num_data
()
const
=
0
;
virtual
void
ReSize
(
data_size_t
num_data
)
=
0
;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
virtual
void
ConstructHistogram
(
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
virtual
void
ConstructHistogram
(
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin defualt bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
MissingType
missing_type
,
bool
default_left
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
=
0
;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual
OrderedBin
*
CreateOrderedBin
()
const
=
0
;
/*!
private:
* \brief After pushed all feature data, call this could have better refactor for bin data
/*! \brief Number of bins */
*/
int
num_bin_
;
virtual
void
FinishLoad
()
=
0
;
MissingType
missing_type_
;
/*! \brief Store upper bound for each bin */
std
::
vector
<
double
>
bin_upper_bound_
;
/*! \brief True if this feature is trival */
bool
is_trival_
;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double
sparse_rate_
;
/*! \brief Type of this bin */
BinType
bin_type_
;
/*! \brief Mapper from categorical to bin */
std
::
unordered_map
<
int
,
unsigned
int
>
categorical_2_bin_
;
/*! \brief Mapper from bin to categorical */
std
::
vector
<
int
>
bin_2_categorical_
;
/*! \brief minimal feature vaule */
double
min_val_
;
/*! \brief maximum feature value */
double
max_val_
;
/*! \brief bin value of feature value 0 */
uint32_t
default_bin_
;
};
/*!
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* \param num_data Total number of data
* There are 2 advantages by using ordered bin.
* \param num_bin Number of bin
* 1. group the data by leafs to improve the cache hit.
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* \param is_enable_sparse True if enable sparse feature
* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* So we only using ordered bin for sparse situations.
* \param is_sparse Will set to true if this bin is sparse
* \param default_bin Default bin for zeros value
* \return The bin data object
*/
*/
static
Bin
*
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
class
OrderedBin
{
double
sparse_rate
,
bool
is_enable_sparse
,
double
sparse_threshold
,
bool
*
is_sparse
);
public:
/*! \brief virtual destructor */
virtual
~
OrderedBin
()
{}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was build for bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual
void
Init
(
const
char
*
used_indices
,
data_size_t
num_leaves
)
=
0
;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param hessians Hessians, Note:non-oredered by leaf
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradients
,
const
score_t
*
hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
* \param right_leaf The new leaf index after perform this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual
void
Split
(
int
leaf
,
int
right_leaf
,
const
char
*
is_in_leaf
,
char
mark
)
=
0
;
virtual
data_size_t
NonZeroCount
(
int
leaf
)
const
=
0
;
};
/*! \brief Iterator for one bin column */
class
BinIterator
{
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual
uint32_t
Get
(
data_size_t
idx
)
=
0
;
virtual
uint32_t
RawGet
(
data_size_t
idx
)
=
0
;
virtual
void
Reset
(
data_size_t
idx
)
=
0
;
virtual
~
BinIterator
()
=
default
;
};
/*!
/*!
* \brief
Create object for bin data of one feature, used
for
dens
e feature
* \brief
Interface for bin data. This class will store bin data
for
on
e feature
.
*
\param num_data Total number of data
*
unlike OrderedBin, this class will store data by original order.
*
\param num_bin Number of bin
*
Note that it may cause cache misses when construct histogram,
*
\return The bin data object
*
but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
*/
static
Bin
*
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
);
class
Bin
{
public:
/*!
/*! \brief virtual destructor */
* \brief Create object for bin data of one feature, used for sparse feature
virtual
~
Bin
()
{}
* \param num_data Total number of data
/*!
* \param num_bin Number of bin
* \brief Push one record
* \return The bin data object
* \pram tid Thread id
*/
* \param idx Index of record
static
Bin
*
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
);
* \param value bin value of record
};
*/
virtual
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
=
0
;
inline
uint32_t
BinMapper
::
ValueToBin
(
double
value
)
const
{
if
(
std
::
isnan
(
value
))
{
if
(
missing_type_
==
MissingType
::
NaN
)
{
virtual
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
=
0
;
return
num_bin_
-
1
;
/*!
}
else
{
* \brief Get bin iterator of this bin for specific feature
value
=
0.0
f
;
* \param min_bin min_bin of current used feature
}
* \param max_bin max_bin of current used feature
}
* \param default_bin default bin if bin not in [min_bin, max_bin]
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
* \return Iterator of this bin
// binary search to find bin
*/
int
l
=
0
;
virtual
BinIterator
*
GetIterator
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
)
const
=
0
;
int
r
=
num_bin_
-
1
;
if
(
missing_type_
==
MissingType
::
NaN
)
{
/*!
r
-=
1
;
* \brief Save binary data to file
}
* \param file File want to write
while
(
l
<
r
)
{
*/
int
m
=
(
r
+
l
-
1
)
/
2
;
virtual
void
SaveBinaryToFile
(
FILE
*
file
)
const
=
0
;
if
(
value
<=
bin_upper_bound_
[
m
])
{
r
=
m
;
/*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual
void
LoadFromMemory
(
const
void
*
memory
,
const
std
::
vector
<
data_size_t
>&
local_used_indices
)
=
0
;
/*!
* \brief Get sizes in byte of this object
*/
virtual
size_t
SizesInByte
()
const
=
0
;
/*! \brief Number of all data */
virtual
data_size_t
num_data
()
const
=
0
;
virtual
void
ReSize
(
data_size_t
num_data
)
=
0
;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
virtual
void
ConstructHistogram
(
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
virtual
void
ConstructHistogram
(
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
HistogramBinEntry
*
out
)
const
=
0
;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin defualt bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
MissingType
missing_type
,
bool
default_left
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
=
0
;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual
OrderedBin
*
CreateOrderedBin
()
const
=
0
;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual
void
FinishLoad
()
=
0
;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \param default_bin Default bin for zeros value
* \return The bin data object
*/
static
Bin
*
CreateBin
(
data_size_t
num_data
,
int
num_bin
,
double
sparse_rate
,
bool
is_enable_sparse
,
double
sparse_threshold
,
bool
*
is_sparse
);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static
Bin
*
CreateDenseBin
(
data_size_t
num_data
,
int
num_bin
);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static
Bin
*
CreateSparseBin
(
data_size_t
num_data
,
int
num_bin
);
};
inline
uint32_t
BinMapper
::
ValueToBin
(
double
value
)
const
{
if
(
std
::
isnan
(
value
))
{
if
(
missing_type_
==
MissingType
::
NaN
)
{
return
num_bin_
-
1
;
}
else
{
}
else
{
l
=
m
+
1
;
value
=
0.0
f
;
}
}
}
}
return
l
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
}
else
{
// binary search to find bin
int
int_value
=
static_cast
<
int
>
(
value
);
int
l
=
0
;
if
(
categorical_2_bin_
.
count
(
int_value
))
{
int
r
=
num_bin_
-
1
;
return
categorical_2_bin_
.
at
(
int_value
);
if
(
missing_type_
==
MissingType
::
NaN
)
{
r
-=
1
;
}
while
(
l
<
r
)
{
int
m
=
(
r
+
l
-
1
)
/
2
;
if
(
value
<=
bin_upper_bound_
[
m
])
{
r
=
m
;
}
else
{
l
=
m
+
1
;
}
}
return
l
;
}
else
{
}
else
{
return
num_bin_
-
1
;
int
int_value
=
static_cast
<
int
>
(
value
);
// convert negative value to NaN bin
if
(
int_value
<
0
)
{
return
num_bin_
-
1
;
}
if
(
categorical_2_bin_
.
count
(
int_value
))
{
return
categorical_2_bin_
.
at
(
int_value
);
}
else
{
return
num_bin_
-
1
;
}
}
}
}
}
}
}
// namespace LightGBM
}
// namespace LightGBM
...
...
include/LightGBM/tree.h
View file @
c62dcf73
...
@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
...
@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node
// internal node
}
else
{
}
else
{
const
int
hot_index
=
Decision
(
feature_values
[
split_index
],
node
);
const
int
hot_index
=
decision_funs
[
GetDecisionType
(
decision_type_
[
node
],
kCategoricalMask
)](
feature_values
[
split_index
],
threshold_
[
node
]);
const
int
cold_index
=
(
hot_index
==
left_child_
[
node
]
?
right_child_
[
node
]
:
left_child_
[
node
]);
const
int
cold_index
=
(
hot_index
==
left_child_
[
node
]
?
right_child_
[
node
]
:
left_child_
[
node
]);
const
double
w
=
data_count
(
node
);
const
double
w
=
data_count
(
node
);
const
double
hot_zero_fraction
=
data_count
(
hot_index
)
/
w
;
const
double
hot_zero_fraction
=
data_count
(
hot_index
)
/
w
;
...
...
src/io/bin.cpp
View file @
c62dcf73
...
@@ -16,461 +16,500 @@
...
@@ -16,461 +16,500 @@
namespace
LightGBM
{
namespace
LightGBM
{
BinMapper
::
BinMapper
()
{
BinMapper
::
BinMapper
()
{
}
// deep copy function for BinMapper
BinMapper
::
BinMapper
(
const
BinMapper
&
other
)
{
num_bin_
=
other
.
num_bin_
;
missing_type_
=
other
.
missing_type_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_type_
=
other
.
bin_type_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
}
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
}
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
// deep copy function for BinMapper
CopyFrom
(
reinterpret_cast
<
const
char
*>
(
memory
));
BinMapper
::
BinMapper
(
const
BinMapper
&
other
)
{
}
num_bin_
=
other
.
num_bin_
;
missing_type_
=
other
.
missing_type_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_type_
=
other
.
bin_type_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
}
BinMapper
::~
BinMapper
()
{
BinMapper
::
BinMapper
(
const
void
*
memory
)
{
CopyFrom
(
reinterpret_cast
<
const
char
*>
(
memory
));
}
}
BinMapper
::~
BinMapper
()
{
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
,
BinType
bin_type
)
{
}
if
(
bin_type
==
BinType
::
NumericalBin
)
{
int
sum_left
=
0
;
bool
NeedFilter
(
const
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
,
BinType
bin_type
)
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
sum_left
+=
cnt_in_bin
[
i
];
int
sum_left
=
0
;
if
(
sum_left
>=
filter_cnt
&&
total_cnt
-
sum_left
>=
filter_cnt
)
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
return
false
;
sum_left
+=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
&&
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
}
}
else
{
}
else
{
if
(
cnt_in_bin
.
size
()
<=
2
)
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
int
sum_left
=
cnt_in_bin
[
i
];
int
sum_left
=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
&&
total_cnt
-
sum_left
>=
filter_cnt
)
{
if
(
sum_left
>=
filter_cnt
&&
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
else
{
return
false
;
return
false
;
}
}
}
}
return
true
;
}
}
return
true
;
}
std
::
vector
<
double
>
GreedyFindBin
(
const
double
*
distinct_values
,
const
int
*
counts
,
int
num_distinct_values
,
int
max_bin
,
size_t
total_cnt
,
int
min_data_in_bin
)
{
std
::
vector
<
double
>
bin_upper_bound
;
if
(
num_distinct_values
<=
max_bin
)
{
bin_upper_bound
.
clear
();
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_distinct_values
-
1
;
++
i
)
{
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cur_cnt_inbin
=
0
;
}
}
cur_cnt_inbin
+=
counts
[
num_distinct_values
-
1
];
bin_upper_bound
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
}
else
{
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_cnt
)
/
max_bin
;
// mean size for one bin
std
::
vector
<
double
>
GreedyFindBin
(
const
double
*
distinct_values
,
const
int
*
counts
,
int
rest_bin_cnt
=
max_bin
;
int
num_distinct_values
,
int
max_bin
,
size_t
total_cnt
,
int
min_data_in_bin
)
{
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_cnt
);
std
::
vector
<
double
>
bin_upper_bound
;
std
::
vector
<
bool
>
is_big_count_value
(
num_distinct_values
,
false
);
if
(
num_distinct_values
<=
max_bin
)
{
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
bin_upper_bound
.
clear
();
if
(
counts
[
i
]
>=
mean_bin_size
)
{
int
cur_cnt_inbin
=
0
;
is_big_count_value
[
i
]
=
true
;
for
(
int
i
=
0
;
i
<
num_distinct_values
-
1
;
++
i
)
{
--
rest_bin_cnt
;
cur_cnt_inbin
+=
counts
[
i
];
rest_sample_cnt
-=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cur_cnt_inbin
=
0
;
}
}
}
}
cur_cnt_inbin
+=
counts
[
num_distinct_values
-
1
];
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
bin_upper_bound
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
}
else
{
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_cnt
/
min_data_in_bin
));
int
bin_cnt
=
0
;
max_bin
=
std
::
max
(
max_bin
,
1
);
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_distinct_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
}
cur_cnt_inbin
+=
counts
[
i
];
double
mean_bin_size
=
static_cast
<
double
>
(
total_cnt
)
/
max_bin
;
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
// mean size for one bin
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
int
rest_bin_cnt
=
max_bin
;
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_cnt
);
++
bin_cnt
;
std
::
vector
<
bool
>
is_big_count_value
(
num_distinct_values
,
false
);
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
if
(
counts
[
i
]
>=
mean_bin_size
)
{
cur_cnt_inbin
=
0
;
is_big_count_value
[
i
]
=
true
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
)
;
rest_sample_cnt
-=
counts
[
i
]
;
}
}
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_distinct_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
++
bin_cnt
;
// update bin upper bound
bin_upper_bound
.
resize
(
bin_cnt
);
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
}
++
bin_cnt
;
return
bin_upper_bound
;
// update bin upper bound
bin_upper_bound
.
resize
(
bin_cnt
);
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
}
return
bin_upper_bound
;
}
std
::
vector
<
double
>
FindBinWithZeroAsOneBin
(
const
double
*
distinct_values
,
const
int
*
counts
,
int
num_distinct_values
,
int
max_bin
,
size_t
total_sample_cnt
,
int
min_data_in_bin
)
{
std
::
vector
<
double
>
FindBinWithZeroAsMissing
(
const
double
*
distinct_values
,
const
int
*
counts
,
std
::
vector
<
double
>
bin_upper_bound
;
int
num_distinct_values
,
int
max_bin
,
size_t
total_sample_cnt
,
int
min_data_in_bin
)
{
int
left_cnt_data
=
0
;
std
::
vector
<
double
>
bin_upper_bound
;
int
cnt_zero
=
0
;
int
left_cnt_data
=
0
;
int
right_cnt_data
=
0
;
int
cnt_missing
=
0
;
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
int
right_cnt_data
=
0
;
if
(
distinct_values
[
i
]
<=
-
kZeroAsMissingValueRange
)
{
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
left_cnt_data
+=
counts
[
i
];
if
(
distinct_values
[
i
]
<=
-
kZeroAsMissingValueRange
)
{
}
else
if
(
distinct_values
[
i
]
>
kZeroAsMissingValueRange
)
{
left_cnt_data
+=
counts
[
i
];
right_cnt_data
+=
counts
[
i
];
}
else
if
(
distinct_values
[
i
]
>
kZeroAsMissingValueRange
)
{
}
else
{
right_cnt_data
+=
counts
[
i
];
cnt_zero
+=
counts
[
i
];
}
else
{
}
cnt_missing
+=
counts
[
i
];
}
}
}
int
left_cnt
=
0
;
int
left_cnt
=
-
1
;
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
if
(
distinct_values
[
i
]
>
-
kZeroAsMissingValueRange
)
{
if
(
distinct_values
[
i
]
>
-
kZeroAsMissingValueRange
)
{
left_cnt
=
i
;
left_cnt
=
i
;
break
;
break
;
}
}
}
}
if
(
left_cnt
>
0
)
{
if
(
left_cnt
<
0
)
{
int
left_max_bin
=
static_cast
<
int
>
(
static_cast
<
double
>
(
left_cnt_data
)
/
(
total_sample_cnt
-
cnt_missing
)
*
(
max_bin
-
1
));
left_cnt
=
num_distinct_values
;
bin_upper_bound
=
GreedyFindBin
(
distinct_values
,
counts
,
left_cnt
,
left_max_bin
,
left_cnt_data
,
min_data_in_bin
);
}
bin_upper_bound
.
back
()
=
-
kZeroAsMissingValueRange
;
}
int
right_start
=
-
1
;
if
(
left_cnt
>
0
)
{
for
(
int
i
=
left_cnt
;
i
<
num_distinct_values
;
++
i
)
{
int
left_max_bin
=
static_cast
<
int
>
(
static_cast
<
double
>
(
left_cnt_data
)
/
(
total_sample_cnt
-
cnt_zero
)
*
(
max_bin
-
1
));
if
(
distinct_values
[
i
]
>
kZeroAsMissingValueRange
)
{
bin_upper_bound
=
GreedyFindBin
(
distinct_values
,
counts
,
left_cnt
,
left_max_bin
,
left_cnt_data
,
min_data_in_bin
);
right_start
=
i
;
bin_upper_bound
.
back
()
=
-
kZeroAsMissingValueRange
;
break
;
}
}
}
if
(
right_start
>=
0
)
{
int
right_start
=
-
1
;
int
right_max_bin
=
max_bin
-
1
-
static_cast
<
int
>
(
bin_upper_bound
.
size
());
for
(
int
i
=
left_cnt
;
i
<
num_distinct_values
;
++
i
)
{
auto
right_bounds
=
GreedyFindBin
(
distinct_values
+
right_start
,
counts
+
right_start
,
if
(
distinct_values
[
i
]
>
kZeroAsMissingValueRange
)
{
num_distinct_values
-
right_start
,
right_max_bin
,
right_cnt_data
,
min_data_in_bin
);
right_start
=
i
;
bin_upper_bound
.
push_back
(
kZeroAsMissingValueRange
);
break
;
bin_upper_bound
.
insert
(
bin_upper_bound
.
end
(),
right_bounds
.
begin
(),
right_bounds
.
end
());
}
}
else
{
bin_upper_bound
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
}
return
bin_upper_bound
;
}
void
BinMapper
::
FindBin
(
double
*
values
,
int
num_sample_values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
,
bool
use_missing
,
bool
zero_as_missing
)
{
int
na_cnt
=
0
;
int
tmp_num_sample_values
=
0
;
for
(
int
i
=
0
;
i
<
num_sample_values
;
++
i
)
{
if
(
!
std
::
isnan
(
values
[
i
]))
{
values
[
tmp_num_sample_values
++
]
=
values
[
i
];
}
}
if
(
right_start
>=
0
)
{
int
right_max_bin
=
max_bin
-
1
-
static_cast
<
int
>
(
bin_upper_bound
.
size
());
auto
right_bounds
=
GreedyFindBin
(
distinct_values
+
right_start
,
counts
+
right_start
,
num_distinct_values
-
right_start
,
right_max_bin
,
right_cnt_data
,
min_data_in_bin
);
bin_upper_bound
.
push_back
(
kZeroAsMissingValueRange
);
bin_upper_bound
.
insert
(
bin_upper_bound
.
end
(),
right_bounds
.
begin
(),
right_bounds
.
end
());
}
else
{
bin_upper_bound
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
}
return
bin_upper_bound
;
}
}
if
(
!
use_missing
)
{
missing_type_
=
MissingType
::
None
;
void
BinMapper
::
FindBin
(
double
*
values
,
int
num_sample_values
,
size_t
total_sample_cnt
,
}
else
if
(
zero_as_missing
)
{
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
,
bool
use_missing
,
bool
zero_as_missing
)
{
missing_type_
=
MissingType
::
Zero
;
int
na_cnt
=
0
;
}
else
{
int
tmp_num_sample_values
=
0
;
if
(
tmp_num_sample_values
==
num_sample_values
)
{
for
(
int
i
=
0
;
i
<
num_sample_values
;
++
i
)
{
if
(
!
std
::
isnan
(
values
[
i
]))
{
values
[
tmp_num_sample_values
++
]
=
values
[
i
];
}
}
if
(
!
use_missing
)
{
missing_type_
=
MissingType
::
None
;
missing_type_
=
MissingType
::
None
;
}
else
if
(
zero_as_missing
)
{
missing_type_
=
MissingType
::
Zero
;
}
else
{
}
else
{
missing_type_
=
MissingType
::
NaN
;
if
(
tmp_num_sample_values
==
num_sample_values
)
{
missing_type_
=
MissingType
::
None
;
}
else
{
missing_type_
=
MissingType
::
NaN
;
na_cnt
=
num_sample_values
-
tmp_num_sample_values
;
}
}
}
na_cnt
=
num_sample_values
-
tmp_num_sample_values
;
num_sample_values
=
tmp_num_sample_values
;
}
num_sample_values
=
tmp_num_sample_values
;
bin_type_
=
bin_type
;
bin_type_
=
bin_type
;
default_bin_
=
0
;
default_bin_
=
0
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
num_sample_values
-
na_cnt
);
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
num_sample_values
-
na_cnt
);
// find distinct_values first
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
values
,
values
+
num_sample_values
);
std
::
sort
(
values
,
values
+
num_sample_values
);
// push zero in the front
// push zero in the front
if
(
num_sample_values
==
0
||
(
values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
if
(
num_sample_values
==
0
||
(
values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
distinct_values
.
push_back
(
0.0
f
);
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
}
if
(
num_sample_values
>
0
)
{
if
(
num_sample_values
>
0
)
{
distinct_values
.
push_back
(
values
[
0
]);
distinct_values
.
push_back
(
values
[
0
]);
counts
.
push_back
(
1
);
counts
.
push_back
(
1
);
}
}
for
(
int
i
=
1
;
i
<
num_sample_values
;
++
i
)
{
for
(
int
i
=
1
;
i
<
num_sample_values
;
++
i
)
{
if
(
values
[
i
]
!=
values
[
i
-
1
])
{
if
(
values
[
i
]
!=
values
[
i
-
1
])
{
if
(
values
[
i
-
1
]
<
0.0
f
&&
values
[
i
]
>
0.0
f
)
{
if
(
values
[
i
-
1
]
<
0.0
f
&&
values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0.0
f
);
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
distinct_values
.
push_back
(
values
[
i
]);
counts
.
push_back
(
1
);
}
else
{
++
counts
.
back
();
}
}
distinct_values
.
push_back
(
values
[
i
]);
counts
.
push_back
(
1
);
}
else
{
++
counts
.
back
();
}
}
}
// push zero in the back
// push zero in the back
if
(
num_sample_values
>
0
&&
values
[
num_sample_values
-
1
]
<
0.0
f
&&
zero_cnt
>
0
)
{
if
(
num_sample_values
>
0
&&
values
[
num_sample_values
-
1
]
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0.0
f
);
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
zero_cnt
);
counts
.
push_back
(
zero_cnt
);
}
}
min_val_
=
distinct_values
.
front
();
min_val_
=
distinct_values
.
front
();
max_val_
=
distinct_values
.
back
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
std
::
vector
<
int
>
cnt_in_bin
;
int
num_distinct_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
num_distinct_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
missing_type_
==
MissingType
::
Zero
)
{
if
(
missing_type_
==
MissingType
::
Zero
)
{
bin_upper_bound_
=
FindBinWithZeroAsMissing
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
,
total_sample_cnt
,
min_data_in_bin
);
bin_upper_bound_
=
FindBinWithZeroAsOneBin
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
,
total_sample_cnt
,
min_data_in_bin
);
if
(
bin_upper_bound_
.
size
()
==
2
)
{
if
(
bin_upper_bound_
.
size
()
==
2
)
{
missing_type_
=
MissingType
::
None
;
missing_type_
=
MissingType
::
None
;
}
}
else
if
(
missing_type_
==
MissingType
::
None
)
{
bin_upper_bound_
=
FindBinWithZeroAsOneBin
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
,
total_sample_cnt
,
min_data_in_bin
);
}
else
{
bin_upper_bound_
=
FindBinWithZeroAsOneBin
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
-
1
,
total_sample_cnt
-
na_cnt
,
min_data_in_bin
);
bin_upper_bound_
.
push_back
(
NaN
);
}
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
{
cnt_in_bin
.
resize
(
num_bin_
,
0
);
int
i_bin
=
0
;
for
(
int
i
=
0
;
i
<
num_distinct_values
;
++
i
)
{
if
(
distinct_values
[
i
]
>
bin_upper_bound_
[
i_bin
])
{
++
i_bin
;
}
cnt_in_bin
[
i_bin
]
+=
counts
[
i
];
}
if
(
missing_type_
==
MissingType
::
NaN
)
{
cnt_in_bin
[
num_bin_
-
1
]
=
na_cnt
;
}
}
}
}
else
if
(
missing_type_
==
MissingType
::
None
)
{
CHECK
(
num_bin_
<=
max_bin
);
bin_upper_bound_
=
GreedyFindBin
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
,
total_sample_cnt
,
min_data_in_bin
);
}
else
{
}
else
{
bin_upper_bound_
=
GreedyFindBin
(
distinct_values
.
data
(),
counts
.
data
(),
num_distinct_values
,
max_bin
-
1
,
total_sample_cnt
-
na_cnt
,
min_data_in_bin
);
// convert to int type first
bin_upper_bound_
.
push_back
(
NaN
)
;
std
::
vector
<
int
>
distinct_values_int
;
}
std
::
vector
<
int
>
counts_int
;
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
(
));
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]
));
{
counts_int
.
push_back
(
counts
[
0
]);
cnt_in_bin
.
resize
(
num_bin_
,
0
);
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
int
i_bin
=
0
;
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
for
(
int
i
=
0
;
i
<
num_
distinct_values
;
++
i
)
{
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
if
(
distinct_values
[
i
]
>
bin_upper_bound_
[
i_bin
])
{
counts_int
.
push_back
(
counts
[
i
]);
++
i_bin
;
}
else
{
}
counts_int
.
back
()
+=
counts
[
i
];
cnt_in_bin
[
i_bin
]
+=
counts
[
i
];
}
}
}
if
(
missing_type_
==
MissingType
::
NaN
)
{
// sort by counts
cnt_in_bin
[
num_bin_
-
1
]
=
na_cnt
;
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
// avoid first bin is zero
if
(
distinct_values_int
[
0
]
==
0
&&
counts_int
.
size
()
>
1
)
{
std
::
swap
(
counts_int
[
0
],
counts_int
[
1
]);
std
::
swap
(
distinct_values_int
[
0
],
distinct_values_int
[
1
]);
}
}
}
// will ignore the categorical of small counts
CHECK
(
num_bin_
<=
max_bin
);
int
cut_cnt
=
static_cast
<
int
>
((
total_sample_cnt
-
na_cnt
)
*
0.99
f
);
}
else
{
size_t
cur_cat
=
0
;
// No missing handle for categorical features
categorical_2_bin_
.
clear
();
missing_type_
=
MissingType
::
None
;
bin_2_categorical_
.
clear
();
// convert to int type first
num_bin_
=
0
;
std
::
vector
<
int
>
distinct_values_int
;
int
used_cnt
=
0
;
std
::
vector
<
int
>
counts_int
;
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]));
cnt_in_bin
.
clear
();
counts_int
.
push_back
(
counts
[
0
]);
while
(
cur_cat
<
distinct_values_int
.
size
()
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
&&
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
))
{
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
if
(
distinct_values_int
[
cur_cat
]
<
0
)
{
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
na_cnt
+=
counts_int
[
cur_cat
];
counts_int
.
push_back
(
counts
[
i
]);
cut_cnt
-=
counts_int
[
cur_cat
];
Log
::
Warning
(
"Met negative value in categorical features, will convert it to NaN"
);
}
else
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
cur_cat
]);
categorical_2_bin_
[
distinct_values_int
[
cur_cat
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
cur_cat
];
cnt_in_bin
.
push_back
(
counts_int
[
cur_cat
]);
++
num_bin_
;
}
++
cur_cat
;
}
// need an additional bin for NaN
if
(
cur_cat
==
distinct_values_int
.
size
()
&&
na_cnt
>
0
)
{
// use -1 to represent NaN
bin_2_categorical_
.
push_back
(
-
1
);
categorical_2_bin_
[
-
1
]
=
num_bin_
;
cnt_in_bin
.
push_back
(
0
);
++
num_bin_
;
}
// Use MissingType::None to represent this bin contains all categoricals
if
(
cur_cat
==
distinct_values_int
.
size
()
&&
na_cnt
==
0
)
{
missing_type_
=
MissingType
::
None
;
}
else
if
(
na_cnt
==
0
)
{
missing_type_
=
MissingType
::
Zero
;
}
else
{
}
else
{
counts_int
.
back
()
+=
counts
[
i
]
;
missing_type_
=
MissingType
::
NaN
;
}
}
cnt_in_bin
.
back
()
+=
static_cast
<
int
>
(
total_sample_cnt
-
used_cnt
);
}
}
// sort by counts
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
// check trival(num_bin_ == 1) feature
// will ignore the categorical of small counts
if
(
num_bin_
<=
1
)
{
const
int
cut_cnt
=
static_cast
<
int
>
(
total_sample_cnt
*
0.98
f
);
is_trival_
=
true
;
categorical_2_bin_
.
clear
();
}
else
{
bin_2_categorical_
.
clear
();
is_trival_
=
false
;
num_bin_
=
0
;
}
int
used_cnt
=
0
;
// check useless bin
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
if
(
!
is_trival_
&&
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
,
bin_type_
))
{
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
is_trival_
=
true
;
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
}
}
cnt_in_bin
=
counts_int
;
counts_int
.
resize
(
num_bin_
);
counts_int
.
back
()
+=
static_cast
<
int
>
(
total_sample_cnt
-
used_cnt
);
}
// check trival(num_bin_ == 1) feature
if
(
!
is_trival_
)
{
if
(
num_bin_
<=
1
)
{
default_bin_
=
ValueToBin
(
0
);
is_trival_
=
true
;
if
(
bin_type_
==
BinType
::
CategoricalBin
)
{
}
else
{
CHECK
(
default_bin_
>
0
);
is_trival_
=
false
;
}
}
}
// check useless bin
// calculate sparse rate
if
(
!
is_trival_
&&
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
,
bin_type_
))
{
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
default_bin_
])
/
static_cast
<
double
>
(
total_sample_cnt
);
is_trival_
=
true
;
}
}
if
(
!
is_trival_
)
{
default_bin_
=
ValueToBin
(
0
);
int
BinMapper
::
SizeForSpecificBin
(
int
bin
)
{
int
size
=
0
;
size
+=
sizeof
(
int
);
size
+=
sizeof
(
MissingType
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinType
);
size
+=
2
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
return
size
;
}
}
// calculate sparse rate
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
default_bin_
])
/
static_cast
<
double
>
(
total_sample_cnt
);
void
BinMapper
::
CopyTo
(
char
*
buffer
)
const
{
}
std
::
memcpy
(
buffer
,
&
num_bin_
,
sizeof
(
num_bin_
));
buffer
+=
sizeof
(
num_bin_
);
std
::
memcpy
(
buffer
,
&
missing_type_
,
sizeof
(
missing_type_
));
int
BinMapper
::
SizeForSpecificBin
(
int
bin
)
{
buffer
+=
sizeof
(
missing_type_
);
int
size
=
0
;
std
::
memcpy
(
buffer
,
&
is_trival_
,
sizeof
(
is_trival_
));
size
+=
sizeof
(
int
);
buffer
+=
sizeof
(
is_trival_
);
size
+=
sizeof
(
MissingType
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
size
+=
sizeof
(
bool
);
buffer
+=
sizeof
(
sparse_rate_
);
size
+=
sizeof
(
double
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
size
+=
sizeof
(
BinType
);
buffer
+=
sizeof
(
bin_type_
);
size
+=
2
*
sizeof
(
double
);
std
::
memcpy
(
buffer
,
&
min_val_
,
sizeof
(
min_val_
));
size
+=
bin
*
sizeof
(
double
);
buffer
+=
sizeof
(
min_val_
);
size
+=
sizeof
(
uint32_t
);
std
::
memcpy
(
buffer
,
&
max_val_
,
sizeof
(
max_val_
));
return
size
;
buffer
+=
sizeof
(
max_val_
);
}
std
::
memcpy
(
buffer
,
&
default_bin_
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
void
BinMapper
::
CopyTo
(
char
*
buffer
)
const
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
&
num_bin_
,
sizeof
(
num_bin_
));
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
buffer
+=
sizeof
(
num_bin_
);
}
else
{
std
::
memcpy
(
buffer
,
&
missing_type_
,
sizeof
(
missing_type_
));
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
buffer
+=
sizeof
(
missing_type_
);
}
std
::
memcpy
(
buffer
,
&
is_trival_
,
sizeof
(
is_trival_
));
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
buffer
,
&
min_val_
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
buffer
,
&
max_val_
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
buffer
,
&
default_bin_
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
}
}
}
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
std
::
memcpy
(
&
num_bin_
,
buffer
,
sizeof
(
num_bin_
));
std
::
memcpy
(
&
num_bin_
,
buffer
,
sizeof
(
num_bin_
)
)
;
buffer
+=
sizeof
(
num_bin_
);
buffer
+=
sizeof
(
num_bin_
);
std
::
memcpy
(
&
missing_type_
,
buffer
,
sizeof
(
missing_type_
)
);
std
::
memcpy
(
&
missing_type_
,
buffer
,
sizeof
(
missing_type_
)
)
;
buffer
+=
sizeof
(
missing_type_
);
buffer
+=
sizeof
(
m
is
sing_type_
);
std
::
memcpy
(
&
is_trival_
,
buffer
,
sizeof
(
is
_trival_
)
);
std
::
memcpy
(
&
is_trival_
,
buffer
,
sizeof
(
is_trival_
)
)
;
buffer
+=
sizeof
(
is_trival_
);
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
)
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
)
)
;
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rat
e_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_typ
e_
)
)
;
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
)
)
;
buffer
+=
sizeof
(
bin_type_
);
buffer
+=
sizeof
(
b
in_
type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
m
in_
val_
)
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
)
)
;
buffer
+=
sizeof
(
min_val_
);
buffer
+=
sizeof
(
m
in
_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
m
ax
_val_
)
)
;
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
)
)
;
buffer
+=
sizeof
(
max_val_
);
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
)
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
)
)
;
buffer
+=
sizeof
(
default_bin_
);
buffer
+=
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type
_
=
=
BinType
::
NumericalBin
)
{
bin_upper_bound
_
=
std
::
vector
<
double
>
(
num_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
)
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
else
{
}
else
{
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
)
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
)
);
categorical_2_bin_
.
clear
(
);
categorical_2_bin_
.
clear
();
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
}
}
}
}
}
}
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
num_bin_
,
sizeof
(
num_bin
_
),
1
,
file
);
fwrite
(
&
missing_type_
,
sizeof
(
missing_type
_
),
1
,
file
);
fwrite
(
&
m
is
sing_type
_
,
sizeof
(
m
is
sing_type
_
),
1
,
file
);
fwrite
(
&
is
_trival
_
,
sizeof
(
is
_trival
_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival
_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate
_
),
1
,
file
);
fwrite
(
&
sparse_rat
e_
,
sizeof
(
sparse_rat
e_
),
1
,
file
);
fwrite
(
&
bin_typ
e_
,
sizeof
(
bin_typ
e_
),
1
,
file
);
fwrite
(
&
b
in_
type
_
,
sizeof
(
b
in_
type
_
),
1
,
file
);
fwrite
(
&
m
in_
val
_
,
sizeof
(
m
in_
val
_
),
1
,
file
);
fwrite
(
&
m
in
_val_
,
sizeof
(
m
in
_val_
),
1
,
file
);
fwrite
(
&
m
ax
_val_
,
sizeof
(
m
ax
_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val
_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin
_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
}
else
{
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
}
}
}
}
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
missing_type_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
missing_type_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
}
else
{
ret
+=
sizeof
(
int
)
*
num_bin_
;
ret
+=
sizeof
(
int
)
*
num_bin_
;
}
return
ret
;
}
}
return
ret
;
}
// Explicit instantiations for every bin-index width the factory
// functions below can return (4-bit dense bins are a non-template class).
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
                    bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
  // A feature is stored sparsely only when sparse storage is enabled and
  // its fraction of default-valued rows reaches the configured threshold.
  const bool use_sparse = is_enable_sparse && sparse_rate >= sparse_threshold;
  *is_sparse = use_sparse;  // report the chosen representation to the caller
  if (use_sparse) {
    return CreateSparseBin(num_data, num_bin);
  }
  return CreateDenseBin(num_data, num_bin);
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  // Pick the narrowest per-row storage that can index every bin.
  if (num_bin <= 16) {
    // 4-bit bins: two values packed per byte.
    return new Dense4bitsBin(num_data);
  }
  if (num_bin <= 256) {
    return new DenseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
  }
  return new DenseBin<uint32_t>(num_data);
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  // Sparse bins store (delta-index, value) pairs; choose the narrowest
  // value type that can hold every bin index.
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
  }
  return new SparseBin<uint32_t>(num_data);
}
}
// namespace LightGBM
}
// namespace LightGBM
src/io/config.cpp
View file @
c62dcf73
...
@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() {
...
@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() {
}
}
// Check max_depth and num_leaves
// Check max_depth and num_leaves
if
(
boosting_config
.
tree_config
.
max_depth
>
0
)
{
if
(
boosting_config
.
tree_config
.
max_depth
>
0
)
{
int
full_num_leaves
=
std
::
pow
(
2
,
boosting_config
.
tree_config
.
max_depth
);
int
full_num_leaves
=
static_cast
<
int
>
(
std
::
pow
(
2
,
boosting_config
.
tree_config
.
max_depth
)
)
;
if
(
full_num_leaves
>
boosting_config
.
tree_config
.
num_leaves
if
(
full_num_leaves
>
boosting_config
.
tree_config
.
num_leaves
&&
boosting_config
.
tree_config
.
num_leaves
==
kDefaultNumLeaves
)
{
&&
boosting_config
.
tree_config
.
num_leaves
==
kDefaultNumLeaves
)
{
Log
::
Warning
(
"Accuarcy may be bad since you didn't set num_leaves."
);
Log
::
Warning
(
"Accuarcy may be bad since you didn't set num_leaves."
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment