tianlh / LightGBM-DCU · Commits

Commit eade219e, authored Mar 18, 2017 by Qiwei Ye

    merge conflict

Parents: f23e6083, 060bd316

Showing 20 changed files with 1060 additions and 596 deletions (+1060 / -596)
src/io/tree.cpp                                     +210   -37
src/main.cpp                                          +2   -18
src/metric/binary_metric.hpp                          +9    -9
src/metric/map_metric.hpp                           +157    -0
src/metric/metric.cpp                                 +5    -0
src/metric/rank_metric.hpp                            +3    -3
src/metric/regression_metric.hpp                     +18    -0
src/network/linkers_socket.cpp                        +1    -1
src/objective/binary_objective.hpp                   +13   -10
src/objective/multiclass_objective.hpp               +29    -9
src/objective/objective_function.cpp                  +2    -0
src/objective/rank_objective.hpp                      +1    -0
src/objective/regression_objective.hpp               +50    -0
src/treelearner/data_parallel_tree_learner.cpp       +80   -65
src/treelearner/data_partition.hpp                   +10    -5
src/treelearner/feature_histogram.hpp               +174  -202
src/treelearner/feature_parallel_tree_learner.cpp    +10   -19
src/treelearner/leaf_splits.hpp                       +8   -28
src/treelearner/parallel_tree_learner.h               +5    -1
src/treelearner/serial_tree_learner.cpp             +273  -189
src/io/tree.cpp

@@ -4,7 +4,6 @@
 #include <LightGBM/utils/common.h>
 #include <LightGBM/dataset.h>
 #include <LightGBM/feature.h>
 #include <sstream>
 #include <unordered_map>
@@ -16,11 +15,10 @@
 namespace LightGBM {

-std::vector<std::function<bool(unsigned int, unsigned int)>> Tree::inner_decision_funs =
-  { Tree::NumericalDecision<unsigned int>, Tree::CategoricalDecision<unsigned int> };
-std::vector<std::function<bool(double, double)>> Tree::decision_funs =
-  { Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
+std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
+  { Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
+std::vector<bool(*)(double, double)> Tree::decision_funs =
+  { Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };

 Tree::Tree(int max_leaves)
   :max_leaves_(max_leaves) {
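For context, swapping std::function for raw function pointers here removes per-call type erasure on a hot dispatch path. A minimal, self-contained sketch of the same pattern, with hypothetical stand-in functions (not the library's API):

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-ins for Tree::NumericalDecision / Tree::CategoricalDecision.
    template <typename T>
    bool NumericalDecision(T fval, T threshold) { return fval <= threshold; }
    template <typename T>
    bool CategoricalDecision(T fval, T threshold) { return fval == threshold; }

    // decision_type_ (0 = numerical, 1 = categorical) indexes this table directly,
    // so traversal pays one indirect call instead of a std::function invocation.
    static std::vector<bool (*)(uint32_t, uint32_t)> kDecisionFuns = {
      NumericalDecision<uint32_t>, CategoricalDecision<uint32_t>
    };

    int main() {
      uint32_t bin = 3, threshold = 5;
      bool go_left = kDecisionFuns[0](bin, threshold);  // numerical: 3 <= 5 -> true
      return go_left ? 0 : 1;
    }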
@@ -28,9 +26,9 @@ Tree::Tree(int max_leaves)
   num_leaves_ = 0;
   left_child_ = std::vector<int>(max_leaves_ - 1);
   right_child_ = std::vector<int>(max_leaves_ - 1);
+  split_feature_inner = std::vector<int>(max_leaves_ - 1);
   split_feature_ = std::vector<int>(max_leaves_ - 1);
-  split_feature_real_ = std::vector<int>(max_leaves_ - 1);
-  threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
+  threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
   threshold_ = std::vector<double>(max_leaves_ - 1);
   decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
   split_gain_ = std::vector<double>(max_leaves_ - 1);
@@ -44,12 +42,14 @@ Tree::Tree(int max_leaves)
   leaf_depth_[0] = 0;
   num_leaves_ = 1;
   leaf_parent_[0] = -1;
   shrinkage_ = 1.0f;
+  has_categorical_ = false;
 }

 Tree::~Tree() {
 }

-int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_bin, int real_feature,
+int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
                 double threshold_double, double left_value, double right_value,
                 data_size_t left_cnt, data_size_t right_cnt, double gain) {
   int new_node_idx = num_leaves_ - 1;
@@ -64,15 +64,16 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
     }
   }
   // add new node
-  split_feature_[new_node_idx] = feature;
-  split_feature_real_[new_node_idx] = real_feature;
-  threshold_in_bin_[new_node_idx] = threshold_bin;
-  threshold_[new_node_idx] = threshold_double;
+  split_feature_inner[new_node_idx] = feature;
+  split_feature_[new_node_idx] = real_feature;
+  if (bin_type == BinType::NumericalBin) {
+    decision_type_[new_node_idx] = 0;
+  } else {
+    has_categorical_ = true;
+    decision_type_[new_node_idx] = 1;
+  }
+  threshold_in_bin_[new_node_idx] = threshold_bin;
+  threshold_[new_node_idx] = threshold_double;
   split_gain_[new_node_idx] = gain;
   // add two new leaves
   left_child_[new_node_idx] = ~leaf;
@@ -96,36 +97,206 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
 }

 void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
-  Threading::For<data_size_t>(0, num_data,
-    [this, data, score](int, data_size_t start, data_size_t end) {
-    std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
-    for (int i = 0; i < data->num_features(); ++i) {
-      iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start));
-    }
-    for (data_size_t i = start; i < end; ++i) {
-      score[i] += static_cast<double>(leaf_value_[GetLeaf(iterators, i)]);
-    }
-  });
-}
+  if (num_leaves_ <= 1) { return; }
+  if (has_categorical_) {
+    if (data->num_features() > num_leaves_ - 1) {
+      Threading::For<data_size_t>(0, num_data,
+        [this, &data, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
+        for (int i = 0; i < num_leaves_ - 1; ++i) {
+          const int fidx = split_feature_inner[i];
+          iter[i].reset(data->FeatureIterator(fidx));
+          iter[i]->Reset(start);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          while (node >= 0) {
+            if (inner_decision_funs[decision_type_[node]](
+                iter[node]->Get(i), threshold_in_bin_[node])) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[i] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    } else {
+      Threading::For<data_size_t>(0, num_data,
+        [this, &data, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
+        for (int i = 0; i < data->num_features(); ++i) {
+          iter[i].reset(data->FeatureIterator(i));
+          iter[i]->Reset(start);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          while (node >= 0) {
+            if (inner_decision_funs[decision_type_[node]](
+                iter[split_feature_inner[node]]->Get(i), threshold_in_bin_[node])) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[i] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    }
+  } else {
+    if (data->num_features() > num_leaves_ - 1) {
+      Threading::For<data_size_t>(0, num_data,
+        [this, &data, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
+        for (int i = 0; i < num_leaves_ - 1; ++i) {
+          const int fidx = split_feature_inner[i];
+          iter[i].reset(data->FeatureIterator(fidx));
+          iter[i]->Reset(start);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          while (node >= 0) {
+            if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[i] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    } else {
+      Threading::For<data_size_t>(0, num_data,
+        [this, &data, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
+        for (int i = 0; i < data->num_features(); ++i) {
+          iter[i].reset(data->FeatureIterator(i));
+          iter[i]->Reset(start);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          while (node >= 0) {
+            if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[i] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    }
+  }
+}
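The traversal above encodes leaves as negative node indices: internal children are >= 0, a leaf is stored as ~leaf_index, and ~node recovers the leaf index once the walk falls below zero. A tiny standalone illustration of the convention (toy tree, hypothetical data):

    #include <cstdio>

    int main() {
      // One split, two leaves: left child is leaf 0 (~0 == -1), right is leaf 1 (~1 == -2).
      int left_child[]  = { ~0 };
      int right_child[] = { ~1 };
      double leaf_value[] = { 0.25, -0.5 };
      unsigned int threshold[] = { 5 };

      unsigned int feature_bin = 7;   // toy sample falls to the right
      int node = 0;
      while (node >= 0) {
        node = (feature_bin <= threshold[node]) ? left_child[node] : right_child[node];
      }
      std::printf("leaf=%d value=%f\n", ~node, leaf_value[~node]);  // leaf=1 value=-0.5
      return 0;
    }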
-void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices,
-                                data_size_t num_data, double* score) const {
-  Threading::For<data_size_t>(0, num_data,
-    [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
-    std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
-    for (int i = 0; i < data->num_features(); ++i) {
-      iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
-    }
-    for (data_size_t i = start; i < end; ++i) {
-      score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]);
-    }
-  });
-}
+void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices,
+                                data_size_t num_data, double* score) const {
+  if (num_leaves_ <= 1) { return; }
+  if (has_categorical_) {
+    if (data->num_features() > num_leaves_ - 1) {
+      Threading::For<data_size_t>(0, num_data,
+        [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
+        for (int i = 0; i < num_leaves_ - 1; ++i) {
+          const int fidx = split_feature_inner[i];
+          iter[i].reset(data->FeatureIterator(fidx));
+          iter[i]->Reset(used_data_indices[start]);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          const data_size_t idx = used_data_indices[i];
+          while (node >= 0) {
+            if (inner_decision_funs[decision_type_[node]](
+                iter[node]->Get(idx), threshold_in_bin_[node])) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[idx] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    } else {
+      Threading::For<data_size_t>(0, num_data,
+        [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
+        for (int i = 0; i < data->num_features(); ++i) {
+          iter[i].reset(data->FeatureIterator(i));
+          iter[i]->Reset(used_data_indices[start]);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          const data_size_t idx = used_data_indices[i];
+          int node = 0;
+          while (node >= 0) {
+            if (inner_decision_funs[decision_type_[node]](
+                iter[split_feature_inner[node]]->Get(idx), threshold_in_bin_[node])) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[idx] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    }
+  } else {
+    if (data->num_features() > num_leaves_ - 1) {
+      Threading::For<data_size_t>(0, num_data,
+        [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
+        for (int i = 0; i < num_leaves_ - 1; ++i) {
+          const int fidx = split_feature_inner[i];
+          iter[i].reset(data->FeatureIterator(fidx));
+          iter[i]->Reset(used_data_indices[start]);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          int node = 0;
+          const data_size_t idx = used_data_indices[i];
+          while (node >= 0) {
+            if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[idx] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    } else {
+      Threading::For<data_size_t>(0, num_data,
+        [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
+        std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
+        for (int i = 0; i < data->num_features(); ++i) {
+          iter[i].reset(data->FeatureIterator(i));
+          iter[i]->Reset(used_data_indices[start]);
+        }
+        for (data_size_t i = start; i < end; ++i) {
+          const data_size_t idx = used_data_indices[i];
+          int node = 0;
+          while (node >= 0) {
+            if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
+              node = left_child_[node];
+            } else {
+              node = right_child_[node];
+            }
+          }
+          score[idx] += static_cast<double>(leaf_value_[~node]);
+        }
+      });
+    }
+  }
+}
 std::string Tree::ToString() {
   std::stringstream str_buf;
   str_buf << "num_leaves=" << num_leaves_ << std::endl;
   str_buf << "split_feature="
-          << Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl;
+          << Common::ArrayToString<int>(split_feature_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "split_gain="
           << Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "threshold="
@@ -146,6 +317,7 @@ std::string Tree::ToString() {
           << Common::ArrayToString<double>(internal_value_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "internal_count="
           << Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
+  str_buf << "shrinkage=" << shrinkage_ << std::endl;
   str_buf << std::endl;
   return str_buf.str();
 }
@@ -154,7 +326,7 @@ std::string Tree::ToJSON() {
   std::stringstream str_buf;
   str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
   str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
   str_buf << "\"shrinkage\":" << shrinkage_ << "," << std::endl;
   str_buf << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
   return str_buf.str();
@@ -167,7 +339,7 @@ std::string Tree::NodeToJSON(int index) {
     // non-leaf
     str_buf << "{" << std::endl;
     str_buf << "\"split_index\":" << index << "," << std::endl;
-    str_buf << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl;
+    str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
     str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
     str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
     str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
...
@@ -208,7 +380,8 @@ Tree::Tree(const std::string& str) {
||
key_vals
.
count
(
"left_child"
)
<=
0
||
key_vals
.
count
(
"right_child"
)
<=
0
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"decision_type"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"shrinkage"
)
<=
0
||
key_vals
.
count
(
"decision_type"
)
<=
0
)
{
Log
::
Fatal
(
"Tree model string format error"
);
}
...
...
@@ -217,17 +390,17 @@ Tree::Tree(const std::string& str) {
   left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1);
   right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
-  split_feature_real_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
+  split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
   threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
+  decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
   split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
   internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
   internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
-  decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
   leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_);
   leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
   leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
   Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
 }

 }  // namespace LightGBM
src/main.cpp

@@ -2,22 +2,6 @@
 #include <LightGBM/application.h>

 int main(int argc, char** argv) {
-  try {
-    LightGBM::Application app(argc, argv);
-    app.Run();
-  }
-  catch (const std::exception& ex) {
-    std::cerr << "Met Exceptions:" << std::endl;
-    std::cerr << ex.what() << std::endl;
-    exit(-1);
-  }
-  catch (const std::string& ex) {
-    std::cerr << "Met Exceptions:" << std::endl;
-    std::cerr << ex << std::endl;
-    exit(-1);
-  }
-  catch (...) {
-    std::cerr << "Unknown Exceptions" << std::endl;
-    exit(-1);
-  }
+  LightGBM::Application app(argc, argv);
+  app.Run();
 }
src/metric/binary_metric.hpp

@@ -63,7 +63,7 @@ public:
 #pragma omp parallel for schedule(static) reduction(+:sum_loss)
       for (data_size_t i = 0; i < num_data_; ++i) {
         // sigmoid transform
-        double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
+        double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
         // add loss
         sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
       }

@@ -71,7 +71,7 @@ public:
 #pragma omp parallel for schedule(static) reduction(+:sum_loss)
       for (data_size_t i = 0; i < num_data_; ++i) {
         // sigmoid transform
-        double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
+        double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
         // add loss
         sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob) * weights_[i];
       }
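The factor of 2.0f is folded out of the transform here (and, in the matching objective change below, out of the gradients), so sigmoid_ now plays the role that 2 * sigmoid_ played before. A quick standalone numeric check of that equivalence, assuming nothing beyond the two formulas in the hunk:

    #include <cmath>
    #include <cstdio>

    int main() {
      double score = 0.7, sigmoid_old = 1.0;
      double sigmoid_new = 2.0 * sigmoid_old;  // new parameterization absorbs the factor of 2
      double prob_old = 1.0 / (1.0 + std::exp(-2.0 * sigmoid_old * score));
      double prob_new = 1.0 / (1.0 + std::exp(-sigmoid_new * score));
      std::printf("%f %f\n", prob_old, prob_new);  // identical probabilities
      return 0;
    }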
@@ -103,7 +103,7 @@ public:
   explicit BinaryLoglossMetric(const MetricConfig& config)
     : BinaryMetric<BinaryLoglossMetric>(config) {}

   inline static double LossOnPoint(float label, double prob) {
-    if (label == 0) {
+    if (label <= 0) {
       if (1.0f - prob > kEpsilon) {
         return -std::log(1.0f - prob);
       }
@@ -128,9 +128,9 @@ public:
   inline static double LossOnPoint(float label, double prob) {
     if (prob <= 0.5f) {
-      return label;
+      return label > 0;
     } else {
-      return 1.0f - label;
+      return label <= 0;
     }
   }
@@ -207,8 +207,8 @@ public:
           // reset
           cur_neg = cur_pos = 0.0f;
         }
-        cur_neg += 1.0f - cur_label;
-        cur_pos += cur_label;
+        cur_neg += (cur_label <= 0);
+        cur_pos += (cur_label > 0);
       }
     } else {
       // has weights
       for (data_size_t i = 0; i < num_data_; ++i) {

@@ -224,8 +224,8 @@ public:
           // reset
           cur_neg = cur_pos = 0.0f;
         }
-        cur_neg += (1.0f - cur_label) * cur_weight;
-        cur_pos += cur_label * cur_weight;
+        cur_neg += (cur_label <= 0) * cur_weight;
+        cur_pos += (cur_label > 0) * cur_weight;
       }
     }
     accum += cur_neg * (cur_pos * 0.5f + sum_pos);
src/metric/map_metric.hpp · new file (mode 100644)

#ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_
#define LIGHTGBM_METRIC_MAP_METRIC_HPP_

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/metric.h>
#include <LightGBM/utils/openmp_wrapper.h>

#include <sstream>
#include <vector>

namespace LightGBM {

class MapMetric: public Metric {
public:
  explicit MapMetric(const MetricConfig& config) {
    // get eval position
    for (auto k : config.eval_at) {
      eval_at_.push_back(static_cast<data_size_t>(k));
    }
    // get number of threads
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
  }

  ~MapMetric() {
  }

  void Init(const Metadata& metadata, data_size_t num_data) override {
    std::stringstream str_buf;
    for (auto k : eval_at_) {
      name_.emplace_back(std::string("map@") + std::to_string(k));
    }
    num_data_ = num_data;
    // get label
    label_ = metadata.label();
    // get query boundaries
    query_boundaries_ = metadata.query_boundaries();
    if (query_boundaries_ == nullptr) {
      Log::Fatal("For MAP metric, there should be query information");
    }
    num_queries_ = metadata.num_queries();
    Log::Info("total groups: %d, total data: %d", num_queries_, num_data_);
    // get query weights
    query_weights_ = metadata.query_weights();
    if (query_weights_ == nullptr) {
      sum_query_weights_ = static_cast<double>(num_queries_);
    } else {
      sum_query_weights_ = 0.0f;
      for (data_size_t i = 0; i < num_queries_; ++i) {
        sum_query_weights_ += query_weights_[i];
      }
    }
  }

  const std::vector<std::string>& GetName() const override {
    return name_;
  }

  double factor_to_bigger_better() const override {
    return 1.0f;
  }

  void CalMapAtK(std::vector<int> ks, const float* label, const double* score,
                 data_size_t num_data, std::vector<double>* out) const {
    // get sorted indices by score
    std::vector<data_size_t> sorted_idx;
    for (data_size_t i = 0; i < num_data; ++i) {
      sorted_idx.emplace_back(i);
    }
    std::sort(sorted_idx.begin(), sorted_idx.end(),
              [score](data_size_t a, data_size_t b) { return score[a] > score[b]; });
    int num_hit = 0;
    double sum_ap = 0.0f;
    data_size_t cur_left = 0;
    for (size_t i = 0; i < ks.size(); ++i) {
      data_size_t cur_k = ks[i];
      if (cur_k > num_data) { cur_k = num_data; }
      for (data_size_t j = cur_left; j < cur_k; ++j) {
        data_size_t idx = sorted_idx[j];
        if (label[idx] > 0.5f) {
          ++num_hit;
          sum_ap += static_cast<double>(num_hit) / (i + 1.0f);
        }
      }
      (*out)[i] = sum_ap / cur_k;
      cur_left = cur_k;
    }
  }

  std::vector<double> Eval(const double* score) const override {
    // some buffers for multi-threading sum up
    std::vector<std::vector<double>> result_buffer_;
    for (int i = 0; i < num_threads_; ++i) {
      result_buffer_.emplace_back(eval_at_.size(), 0.0f);
    }
    std::vector<double> tmp_map(eval_at_.size(), 0.0f);
    if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
      for (data_size_t i = 0; i < num_queries_; ++i) {
        const int tid = omp_get_thread_num();
        CalMapAtK(eval_at_, label_ + query_boundaries_[i], score + query_boundaries_[i],
                  query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
        for (size_t j = 0; j < eval_at_.size(); ++j) {
          result_buffer_[tid][j] += tmp_map[j];
        }
      }
    } else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
      for (data_size_t i = 0; i < num_queries_; ++i) {
        const int tid = omp_get_thread_num();
        CalMapAtK(eval_at_, label_ + query_boundaries_[i], score + query_boundaries_[i],
                  query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
        for (size_t j = 0; j < eval_at_.size(); ++j) {
          result_buffer_[tid][j] += tmp_map[j] * query_weights_[i];
        }
      }
    }
    // Get final average MAP
    std::vector<double> result(eval_at_.size(), 0.0f);
    for (size_t j = 0; j < result.size(); ++j) {
      for (int i = 0; i < num_threads_; ++i) {
        result[j] += result_buffer_[i][j];
      }
      result[j] /= sum_query_weights_;
    }
    return result;
  }

private:
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Pointer of label */
  const float* label_;
  /*! \brief Query boundaries information */
  const data_size_t* query_boundaries_;
  /*! \brief Number of queries */
  data_size_t num_queries_;
  /*! \brief Weights of queries */
  const float* query_weights_;
  /*! \brief Sum weights of queries */
  double sum_query_weights_;
  /*! \brief Evaluate position of map */
  std::vector<data_size_t> eval_at_;
  /*! \brief Number of threads */
  int num_threads_;
  std::vector<std::string> name_;
};

}  // namespace LightGBM
#endif  // LIGHTGBM_METRIC_MAP_METRIC_HPP_
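To make the new metric concrete, here is a toy MAP@K computation in the spirit of CalMapAtK above (standalone, toy data; it uses the rank of each hit for the precision term and averages over the cutoff k):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      // Toy query: scores and binary relevance labels for 5 documents.
      std::vector<double> score = {0.9, 0.2, 0.8, 0.4, 0.7};
      std::vector<int> label = {1, 0, 0, 1, 1};
      std::vector<int> idx = {0, 1, 2, 3, 4};
      std::sort(idx.begin(), idx.end(), [&](int a, int b) { return score[a] > score[b]; });

      // Precision accumulated at each hit within the top k, then averaged over k.
      int k = 3, hits = 0;
      double sum_ap = 0.0;
      for (int rank = 0; rank < k; ++rank) {
        if (label[idx[rank]] > 0) {
          ++hits;
          sum_ap += static_cast<double>(hits) / (rank + 1);
        }
      }
      std::printf("map@%d = %f\n", k, sum_ap / k);  // hits at ranks 1 and 3 -> 0.5556
      return 0;
    }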
src/metric/metric.cpp

@@ -2,6 +2,7 @@
 #include "regression_metric.hpp"
 #include "binary_metric.hpp"
 #include "rank_metric.hpp"
+#include "map_metric.hpp"
 #include "multiclass_metric.hpp"

 namespace LightGBM {

@@ -15,6 +16,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
     return new HuberLossMetric(config);
   } else if (type == std::string("fair")) {
     return new FairLossMetric(config);
+  } else if (type == std::string("poisson")) {
+    return new PoissonMetric(config);
   } else if (type == std::string("binary_logloss")) {
     return new BinaryLoglossMetric(config);
   } else if (type == std::string("binary_error")) {

@@ -23,6 +26,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
     return new AUCMetric(config);
   } else if (type == std::string("ndcg")) {
     return new NDCGMetric(config);
+  } else if (type == std::string("map")) {
+    return new MapMetric(config);
   } else if (type == std::string("multi_logloss")) {
     return new MultiLoglossMetric(config);
   } else if (type == std::string("multi_error")) {
src/metric/rank_metric.hpp

@@ -6,7 +6,7 @@
 #include <LightGBM/metric.h>

-#include <omp.h>
+#include <LightGBM/utils/openmp_wrapper.h>

 #include <sstream>
 #include <vector>

@@ -90,7 +90,7 @@ public:
     }
     std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
     if (query_weights_ == nullptr) {
-#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
+#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
       for (data_size_t i = 0; i < num_queries_; ++i) {
         const int tid = omp_get_thread_num();
         // if all doc in this query are all negative, let its NDCG=1

@@ -110,7 +110,7 @@ public:
     } else {
-#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
+#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
       for (data_size_t i = 0; i < num_queries_; ++i) {
         const int tid = omp_get_thread_num();
         // if all doc in this query are all negative, let its NDCG=1
src/metric/regression_metric.hpp

@@ -162,5 +162,23 @@ public:
   }
 };

+/*! \brief Poisson regression loss for regression task */
+class PoissonMetric: public RegressionMetric<PoissonMetric> {
+public:
+  explicit PoissonMetric(const MetricConfig& config)
+    : RegressionMetric<PoissonMetric>(config) {
+  }
+
+  inline static double LossOnPoint(float label, double score, double, double) {
+    const double eps = 1e-10f;
+    if (score < eps) {
+      score = eps;
+    }
+    return score - label * std::log(score);
+  }
+  inline static const char* Name() {
+    return "poisson";
+  }
+};
 }  // namespace LightGBM
 #endif  // LightGBM_METRIC_REGRESSION_METRIC_HPP_
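The loss above is the Poisson negative log-likelihood up to a label-only constant: for mean mu and count y, -log p(y|mu) = mu - y*log(mu) + log(y!), and the log(y!) term is dropped since it does not depend on the prediction. A quick standalone check that it is minimized when the prediction equals the label:

    #include <cmath>
    #include <cstdio>

    // Poisson NLL without the label-only log(y!) term, matching LossOnPoint above.
    double poisson_loss(float label, double score) {
      const double eps = 1e-10;
      if (score < eps) score = eps;
      return score - label * std::log(score);
    }

    int main() {
      std::printf("%f %f %f\n",
                  poisson_loss(3.0f, 2.0),   // under-prediction: -0.079
                  poisson_loss(3.0f, 3.0),   // minimum:          -0.296
                  poisson_loss(3.0f, 4.0));  // over-prediction:  -0.159
      return 0;
    }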
src/network/linkers_socket.cpp

@@ -25,7 +25,7 @@ Linkers::Linkers(NetworkConfig config) {
   local_listen_port_ = config.local_listen_port;
   socket_timeout_ = config.time_out;
   rank_ = -1;
-  // parser clients from file
+  // parse clients from file
   ParseMachineList(config.machine_list_filename.c_str());
   if (rank_ == -1) {
src/objective/binary_objective.hpp

@@ -28,14 +28,15 @@ public:
     data_size_t cnt_positive = 0;
     data_size_t cnt_negative = 0;
     // count for positive and negative samples
 #pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
     for (data_size_t i = 0; i < num_data_; ++i) {
-      if (label_[i] == 1) {
+      if (label_[i] > 0) {
         ++cnt_positive;
       } else {
         ++cnt_negative;
       }
     }
-    Log::Info("Number of postive: %d, number of negative: %d", cnt_positive, cnt_negative);
+    Log::Info("Number of positive: %d, number of negative: %d", cnt_positive, cnt_negative);
     // cannot continue if all sample are same class
     if (cnt_positive == 0 || cnt_negative == 0) {
       Log::Fatal("Training data only contains one class");

@@ -64,25 +65,27 @@ public:
 #pragma omp parallel for schedule(static)
       for (data_size_t i = 0; i < num_data_; ++i) {
         // get label and label weights
-        const int label = label_val_[static_cast<int>(label_[i])];
-        const double label_weight = label_weights_[static_cast<int>(label_[i])];
+        const int is_pos = label_[i] > 0;
+        const int label = label_val_[is_pos];
+        const double label_weight = label_weights_[is_pos];
         // calculate gradients and hessians
-        const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
+        const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
         const double abs_response = fabs(response);
         gradients[i] = static_cast<score_t>(response * label_weight);
-        hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight);
+        hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
       }
     } else {
 #pragma omp parallel for schedule(static)
       for (data_size_t i = 0; i < num_data_; ++i) {
         // get label and label weights
-        const int label = label_val_[static_cast<int>(label_[i])];
-        const double label_weight = label_weights_[static_cast<int>(label_[i])];
+        const int is_pos = label_[i] > 0;
+        const int label = label_val_[is_pos];
+        const double label_weight = label_weights_[is_pos];
         // calculate gradients and hessians
-        const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
+        const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
         const double abs_response = fabs(response);
         gradients[i] = static_cast<score_t>(response * label_weight * weights_[i]);
-        hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight * weights_[i]);
+        hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight * weights_[i]);
       }
     }
   }
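For reference, the response/hessian pair above is the standard logistic-loss gradient with labels y in {-1, +1} and scale sigma: for L(s) = log(1 + exp(-y*sigma*s)), the first derivative is -y*sigma / (1 + exp(y*sigma*s)) and the second derivative equals |g| * (sigma - |g|). A standalone numeric check of that identity (assuming nothing beyond the formulas in the hunk):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Logistic loss L(s) = log(1 + exp(-y * sigma * s)), y in {-1, +1}.
      double y = 1.0, sigma = 2.0, s = 0.3;
      double response = -y * sigma / (1.0 + std::exp(y * sigma * s));
      double abs_r = std::fabs(response);
      double hess_closed = abs_r * (sigma - abs_r);           // form used in the hunk

      // Finite-difference second derivative as an independent check.
      auto loss = [&](double x) { return std::log(1.0 + std::exp(-y * sigma * x)); };
      double h = 1e-5;
      double hess_fd = (loss(s + h) - 2.0 * loss(s) + loss(s - h)) / (h * h);

      std::printf("closed=%f fd=%f\n", hess_closed, hess_fd);  // agree to ~1e-5
      return 0;
    }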
src/objective/multiclass_objective.hpp

@@ -14,6 +14,7 @@ class MulticlassLogloss: public ObjectiveFunction {
 public:
   explicit MulticlassLogloss(const ObjectiveConfig& config) {
     num_class_ = config.num_class;
+    is_unbalance_ = config.is_unbalance;
   }

   ~MulticlassLogloss() {

@@ -24,12 +25,25 @@ public:
     label_ = metadata.label();
     weights_ = metadata.weights();
     label_int_.resize(num_data_);
-    for (int i = 0; i < num_data_; ++i){
-      label_int_[i] = static_cast<int>(label_[i]);
-      if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
-        Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
-      }
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < num_data_; ++i) {
+      label_int_[i] = static_cast<int>(label_[i]);
+      if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
+        Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
+      }
     }
+    label_pos_weights_ = std::vector<float>(num_class_, 1);
+    if (is_unbalance_) {
+      std::vector<int> cnts(num_class_, 0);
+      for (int i = 0; i < num_data_; ++i) {
+        ++cnts[label_int_[i]];
+      }
+      for (int i = 0; i < num_class_; ++i) {
+        int cnt_cur = cnts[i];
+        int cnt_other = (num_data_ - cnts[i]);
+        label_pos_weights_[i] = static_cast<float>(cnt_other) / cnt_cur;
+      }
+    }
   }

   void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {

@@ -46,11 +60,12 @@ public:
         auto p = rec[k];
         size_t idx = static_cast<size_t>(num_data_) * k + i;
         if (label_int_[i] == k) {
-          gradients[idx] = static_cast<score_t>(p - 1.0f);
+          gradients[idx] = static_cast<score_t>(p - 1.0f) * label_pos_weights_[k];
+          hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p)) * label_pos_weights_[k];
         } else {
           gradients[idx] = static_cast<score_t>(p);
+          hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
         }
-        hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
       }
     }
   } else {

@@ -66,11 +81,13 @@ public:
         auto p = rec[k];
         size_t idx = static_cast<size_t>(num_data_) * k + i;
         if (label_int_[i] == k) {
-          gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]);
+          gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]) * label_pos_weights_[k];
+          hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]) * label_pos_weights_[k];
         } else {
           gradients[idx] = static_cast<score_t>(p * weights_[i]);
+          hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
         }
-        hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
       }
     }
   }

@@ -91,6 +108,9 @@ private:
   std::vector<int> label_int_;
   /*! \brief Weights for data */
   const float* weights_;
+  /*! \brief Weights for label */
+  std::vector<float> label_pos_weights_;
+  bool is_unbalance_;
 };

 }  // namespace LightGBM
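The is_unbalance_ reweighting above gives each class a weight of (count of the other classes) / (count of this class), so rare classes get proportionally larger gradients. A small standalone illustration:

    #include <cstdio>
    #include <vector>

    int main() {
      // Toy class counts for 3 classes over 100 samples.
      std::vector<int> cnts = {80, 15, 5};
      int num_data = 100;
      for (size_t i = 0; i < cnts.size(); ++i) {
        float w = static_cast<float>(num_data - cnts[i]) / cnts[i];
        std::printf("class %zu: count=%d weight=%.2f\n", i, cnts[i], w);
      }
      // class 0: 0.25, class 1: 5.67, class 2: 19.00 -- the rarer the class,
      // the more each of its samples is scaled up.
      return 0;
    }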
src/objective/objective_function.cpp

@@ -16,6 +16,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
     return new RegressionHuberLoss(config);
   } else if (type == std::string("fair")) {
     return new RegressionFairLoss(config);
+  } else if (type == std::string("poisson")) {
+    return new RegressionPoissonLoss(config);
   } else if (type == std::string("binary")) {
     return new BinaryLogloss(config);
   } else if (type == std::string("lambdarank")) {
src/objective/rank_objective.hpp

@@ -52,6 +52,7 @@ public:
     num_queries_ = metadata.num_queries();
     // cache inverse max DCG, avoid computation many times
     inverse_max_dcgs_.resize(num_queries_);
+#pragma omp parallel for schedule(static)
     for (data_size_t i = 0; i < num_queries_; ++i) {
       inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_,
                                                          label_ + query_boundaries_[i],
src/objective/regression_objective.hpp

@@ -236,5 +236,55 @@ private:
   double c_;
 };

+/*!
+* \brief Objective function for Poisson regression
+*/
+class RegressionPoissonLoss: public ObjectiveFunction {
+public:
+  explicit RegressionPoissonLoss(const ObjectiveConfig& config) {
+    max_delta_step_ = static_cast<double>(config.poisson_max_delta_step);
+  }
+
+  ~RegressionPoissonLoss() {}
+
+  void Init(const Metadata& metadata, data_size_t num_data) override {
+    num_data_ = num_data;
+    label_ = metadata.label();
+    weights_ = metadata.weights();
+  }
+
+  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
+    if (weights_ == nullptr) {
+#pragma omp parallel for schedule(static)
+      for (data_size_t i = 0; i < num_data_; ++i) {
+        gradients[i] = static_cast<score_t>(score[i] - label_[i]);
+        hessians[i] = static_cast<score_t>(score[i] + max_delta_step_);
+      }
+    } else {
+#pragma omp parallel for schedule(static)
+      for (data_size_t i = 0; i < num_data_; ++i) {
+        gradients[i] = static_cast<score_t>((score[i] - label_[i]) * weights_[i]);
+        hessians[i] = static_cast<score_t>((score[i] + max_delta_step_) * weights_[i]);
+      }
+    }
+  }
+
+  const char* GetName() const override {
+    return "poisson";
+  }
+
+private:
+  /*! \brief Number of data */
+  data_size_t num_data_;
+  /*! \brief Pointer of label */
+  const float* label_;
+  /*! \brief Pointer of weights */
+  const float* weights_;
+  /*! \brief used to safeguard optimization */
+  double max_delta_step_;
+};
 }  // namespace LightGBM
 #endif  // LightGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
src/treelearner/data_parallel_tree_learner.cpp

@@ -22,10 +22,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
   rank_ = Network::rank();
   num_machines_ = Network::num_machines();
   // allocate buffer for communication
-  size_t buffer_size = 0;
-  for (int i = 0; i < num_features_; ++i) {
-    buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry);
-  }
+  size_t buffer_size = train_data_->NumTotalBin() * sizeof(HistogramBinEntry);
   input_buffer_.resize(buffer_size);
   output_buffer_.resize(buffer_size);
@@ -50,13 +47,19 @@ void DataParallelTreeLearner::BeforeTrain() {
   // generate feature partition for current tree
   std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
   std::vector<int> num_bins_distributed(num_machines_, 0);
-  for (int i = 0; i < train_data_->num_features(); ++i) {
-    if (is_feature_used_[i]) {
+  for (int i = 0; i < train_data_->num_total_features(); ++i) {
+    int inner_feature_index = train_data_->InnerFeatureIndex(i);
+    if (inner_feature_index == -1) { continue; }
+    if (is_feature_used_[inner_feature_index]) {
       int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
-      feature_distribution[cur_min_machine].push_back(i);
-      num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
+      feature_distribution[cur_min_machine].push_back(inner_feature_index);
+      auto num_bin = train_data_->FeatureNumBin(inner_feature_index);
+      if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) {
+        num_bin -= 1;
+      }
+      num_bins_distributed[cur_min_machine] += num_bin;
     }
-    is_feature_aggregated_[i] = false;
+    is_feature_aggregated_[inner_feature_index] = false;
   }
   // get local used feature
   for (auto fid : feature_distribution[rank_]) {

@@ -68,7 +71,11 @@ void DataParallelTreeLearner::BeforeTrain() {
   for (int i = 0; i < num_machines_; ++i) {
     block_len_[i] = 0;
     for (auto fid : feature_distribution[i]) {
-      block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
+      auto num_bin = train_data_->FeatureNumBin(fid);
+      if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
+        num_bin -= 1;
+      }
+      block_len_[i] += num_bin * sizeof(HistogramBinEntry);
     }
     reduce_scatter_size_ += block_len_[i];
   }

@@ -83,7 +90,11 @@ void DataParallelTreeLearner::BeforeTrain() {
   for (int i = 0; i < num_machines_; ++i) {
     for (auto fid : feature_distribution[i]) {
       buffer_write_start_pos_[fid] = bin_size;
-      bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
+      auto num_bin = train_data_->FeatureNumBin(fid);
+      if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
+        num_bin -= 1;
+      }
+      bin_size += num_bin * sizeof(HistogramBinEntry);
     }
   }

@@ -91,12 +102,16 @@ void DataParallelTreeLearner::BeforeTrain() {
   bin_size = 0;
   for (auto fid : feature_distribution[rank_]) {
     buffer_read_start_pos_[fid] = bin_size;
-    bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
+    auto num_bin = train_data_->FeatureNumBin(fid);
+    if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
+      num_bin -= 1;
+    }
+    bin_size += num_bin * sizeof(HistogramBinEntry);
   }
   // sync global data sumup info
   std::tuple<data_size_t, double, double> data(smaller_leaf_splits_->num_data_in_leaf(),
-    smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
+                                               smaller_leaf_splits_->sum_gradients(),
+                                               smaller_leaf_splits_->sum_hessians());
   int size = sizeof(data);
   std::memcpy(input_buffer_.data(), &data, size);
   // global sumup reduce
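The recurring num_bin adjustment above accounts for features whose default bin is bin 0: that bin is not materialized in the histogram, so each buffer offset shrinks by one HistogramBinEntry. A standalone sketch of the layout computation, with hypothetical per-feature data:

    #include <cstdio>
    #include <vector>

    struct HistogramBinEntry { double sum_gradients, sum_hessians; int cnt; };

    int main() {
      // Hypothetical features: bin counts, and whether the default bin is bin 0.
      std::vector<int> num_bins = {256, 64, 10};
      std::vector<bool> default_bin_is_zero = {true, false, true};

      size_t offset = 0;
      for (size_t fid = 0; fid < num_bins.size(); ++fid) {
        int nb = num_bins[fid];
        if (default_bin_is_zero[fid]) nb -= 1;  // bin 0 is implicit, not stored
        std::printf("feature %zu starts at byte %zu\n", fid, offset);
        offset += nb * sizeof(HistogramBinEntry);
      }
      std::printf("total buffer: %zu bytes\n", offset);
      return 0;
    }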
@@ -125,88 +140,88 @@
 }

 void DataParallelTreeLearner::FindBestThresholds() {
+  train_data_->ConstructHistograms(is_feature_used_,
+    smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
+    smaller_leaf_splits_->LeafIndex(), ordered_bins_, gradients_, hessians_,
+    ordered_gradients_.data(), ordered_hessians_.data(),
+    smaller_leaf_histogram_array_[0].RawData() - 1);
-  // construct local histograms
-#pragma omp parallel for schedule(guided)
+  // copy to buffer
+#pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
     if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
-    // construct histograms for smaller leaf
-    if (ordered_bins_[feature_index] == nullptr) {
-      smaller_leaf_histogram_array_[feature_index].Construct(
-        smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
-        smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
-        ptr_to_ordered_gradients_smaller_leaf_, ptr_to_ordered_hessians_smaller_leaf_);
-    } else {
-      smaller_leaf_histogram_array_[feature_index].Construct(
-        ordered_bins_[feature_index].get(), smaller_leaf_splits_->LeafIndex(),
-        smaller_leaf_splits_->num_data_in_leaf(),
-        smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
-        gradients_, hessians_);
-    }
-    // copy to buffer
     std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
-                smaller_leaf_histogram_array_[feature_index].HistogramData(),
+                smaller_leaf_histogram_array_[feature_index].RawData(),
                 smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
   }
   // Reduce scatter for histogram
   Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(),
                          block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
-#pragma omp parallel for schedule(guided)
+  std::vector<SplitInfo> smaller_best(num_threads_, SplitInfo());
+  std::vector<SplitInfo> larger_best(num_threads_, SplitInfo());
+#pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
     if (!is_feature_aggregated_[feature_index]) continue;
-    // copy global sumup info
-    smaller_leaf_histogram_array_[feature_index].SetSumup(
-      GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
-      smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
+    const int tid = omp_get_thread_num();
     // restore global histograms from buffer
     smaller_leaf_histogram_array_[feature_index].FromMemory(
       output_buffer_.data() + buffer_read_start_pos_[feature_index]);
+    train_data_->FixHistogram(feature_index,
+      smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+      GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
+      smaller_leaf_histogram_array_[feature_index].RawData());
+    SplitInfo smaller_split;
     // find best threshold for smaller child
-    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
-      &smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
+    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
+      smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+      GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()), &smaller_split);
+    if (smaller_split.gain > smaller_best[tid].gain) {
+      smaller_best[tid] = smaller_split;
+      smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+    }
     // only root leaf
     if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
     // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms
     larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
-    // set sumup info for histogram
-    larger_leaf_histogram_array_[feature_index].SetSumup(
-      GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
-      larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians());
+    SplitInfo larger_split;
     // find best threshold for larger child
-    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
-      &larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
+    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
+      larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
+      GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()), &larger_split);
+    if (larger_split.gain > larger_best[tid].gain) {
+      larger_best[tid] = larger_split;
+      larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+    }
   }
+  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
+  int leaf = smaller_leaf_splits_->LeafIndex();
+  best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
+  if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { return; }
+  leaf = larger_leaf_splits_->LeafIndex();
+  auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
+  best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
 }

 void DataParallelTreeLearner::FindBestSplitsForLeaves() {
-  int smaller_best_feature = -1, larger_best_feature = -1;
   SplitInfo smaller_best, larger_best;
-  std::vector<double> gains;
   // find local best split for smaller leaf
-  for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
-    gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
-  }
-  smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-  smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
+  smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
   // find local best split for larger leaf
   if (larger_leaf_splits_->LeafIndex() >= 0) {
-    gains.clear();
-    for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
-      gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
-    }
-    larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-    larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
+    larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
   }
   // sync global best info
@@ -214,7 +229,7 @@ void DataParallelTreeLearner::FindBestSplitsForLeaves() {
   std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));

   Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo),
-    output_buffer_.data(), &SplitInfo::MaxReducer);
+                     output_buffer_.data(), &SplitInfo::MaxReducer);

   std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo));
   std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
src/treelearner/data_partition.hpp

@@ -2,9 +2,9 @@
 #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

 #include <LightGBM/meta.h>
-#include <LightGBM/feature.h>
+#include <LightGBM/dataset.h>
-#include <omp.h>
+#include <LightGBM/utils/openmp_wrapper.h>

 #include <cstring>

@@ -41,7 +41,12 @@ public:
     leaf_begin_.resize(num_leaves_);
     leaf_count_.resize(num_leaves_);
   }

+  void ResetNumData(int num_data) {
+    num_data_ = num_data;
+    indices_.resize(num_data_);
+    temp_left_indices_.resize(num_data_);
+    temp_right_indices_.resize(num_data_);
+  }
   ~DataPartition() {
   }

@@ -88,7 +93,7 @@ public:
   * \param threshold threshold that want to split
   * \param right_leaf index of right leaf
   */
-  void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
+  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
     const data_size_t min_inner_size = 1000;
     // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];

@@ -106,7 +111,7 @@ public:
       data_size_t cur_cnt = inner_size;
       if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
       // split data inner, reduce the times of function called
-      data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt,
+      data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
         temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
       offsets_buf_[i] = cur_start;
       left_cnts_buf_[i] = cur_left_count;
src/treelearner/feature_histogram.hpp

@@ -2,19 +2,31 @@
 #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_

 #include "split_info.hpp"

-#include <LightGBM/feature.h>
+#include <LightGBM/utils/array_args.h>
+#include <LightGBM/dataset.h>

 #include <cstring>

 namespace LightGBM {

+class FeatureMetainfo {
+public:
+  int num_bin;
+  int bias = 0;
+  /*! \brief pointer of tree config */
+  const TreeConfig* tree_config;
+};
 /*!
 * \brief FeatureHistogram is used to construct and store a histogram for a feature.
 */
 class FeatureHistogram {
 public:
   FeatureHistogram() {
+    data_ = nullptr;
   }

   ~FeatureHistogram() {
   }
@@ -28,125 +40,76 @@ public:
   * \param feature the feature data for this histogram
   * \param min_num_data_one_leaf minimal number of data in one leaf
   */
-  void Init(const Feature* feature, int feature_idx, const TreeConfig* tree_config) {
-    feature_idx_ = feature_idx;
-    tree_config_ = tree_config;
-    bin_data_ = feature->bin_data();
-    num_bins_ = feature->num_bin();
-    data_.resize(num_bins_);
-    if (feature->bin_type() == BinType::NumericalBin) {
-      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForNumerical, this,
-                                           std::placeholders::_1);
+  void Init(HistogramBinEntry* data, const FeatureMetainfo* meta, BinType bin_type) {
+    meta_ = meta;
+    data_ = data;
+    if (bin_type == BinType::NumericalBin) {
+      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this,
+                                           std::placeholders::_1, std::placeholders::_2,
+                                           std::placeholders::_3, std::placeholders::_4);
     } else {
-      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForCategorical, this,
-                                           std::placeholders::_1);
+      find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this,
+                                           std::placeholders::_1, std::placeholders::_2,
+                                           std::placeholders::_3, std::placeholders::_4);
    }
   }

-  /*!
-  * \brief Construct a histogram
-  * \param num_data number of data in current leaf
-  * \param sum_gradients sum of gradients of current leaf
-  * \param sum_hessians sum of hessians of current leaf
-  * \param ordered_gradients Orederd gradients
-  * \param ordered_hessians Ordered hessians
-  * \param data_indices data indices of current leaf
-  */
-  void Construct(const data_size_t* data_indices, data_size_t num_data,
-                 double sum_gradients, double sum_hessians,
-                 const score_t* ordered_gradients, const score_t* ordered_hessians) {
-    std::memset(data_.data(), 0, sizeof(HistogramBinEntry) * num_bins_);
-    num_data_ = num_data;
-    sum_gradients_ = sum_gradients;
-    sum_hessians_ = sum_hessians + 2 * kEpsilon;
-    bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients, ordered_hessians, data_.data());
-  }
-
-  /*!
-  * \brief Construct a histogram by ordered bin
-  * \param leaf current leaf
-  * \param num_data number of data in current leaf
-  * \param sum_gradients sum of gradients of current leaf
-  * \param sum_hessians sum of hessians of current leaf
-  * \param gradients
-  * \param hessian
-  */
-  void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data,
-                 double sum_gradients, double sum_hessians,
-                 const score_t* gradients, const score_t* hessians) {
-    std::memset(data_.data(), 0, sizeof(HistogramBinEntry) * num_bins_);
-    num_data_ = num_data;
-    sum_gradients_ = sum_gradients;
-    sum_hessians_ = sum_hessians + 2 * kEpsilon;
-    ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_.data());
-  }
+  HistogramBinEntry* RawData() {
+    return data_;
+  }

-  /*!
-  * \brief Set sumup information for current histogram
-  * \param num_data number of data in current leaf
-  * \param sum_gradients sum of gradients of current leaf
-  * \param sum_hessians sum of hessians of current leaf
-  */
-  void SetSumup(data_size_t num_data, double sum_gradients, double sum_hessians) {
-    num_data_ = num_data;
-    sum_gradients_ = sum_gradients;
-    sum_hessians_ = sum_hessians + 2 * kEpsilon;
-  }

   /*!
   * \brief Subtract current histograms with other
   * \param other The histogram that want to subtract
   */
   void Subtract(const FeatureHistogram& other) {
-    num_data_ -= other.num_data_;
-    sum_gradients_ -= other.sum_gradients_;
-    sum_hessians_ -= other.sum_hessians_;
-    for (unsigned int i = 0; i < num_bins_; ++i) {
+    for (int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
       data_[i].cnt -= other.data_[i].cnt;
       data_[i].sum_gradients -= other.data_[i].sum_gradients;
       data_[i].sum_hessians -= other.data_[i].sum_hessians;
     }
   }

   /*!
   * \brief Find best threshold for this histogram
   * \param output The best split result
   */
-  void FindBestThreshold(SplitInfo* output) {
-    find_best_threshold_fun_(output);
+  void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
+                         SplitInfo* output) {
+    find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
   }

-  void FindBestThresholdForNumerical(SplitInfo* output) {
+  void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
+                                  SplitInfo* output) {
     double best_sum_left_gradient = NAN;
     double best_sum_left_hessian = NAN;
     double best_gain = kMinScore;
     data_size_t best_left_count = 0;
-    unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
+    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
     double sum_right_gradient = 0.0f;
     double sum_right_hessian = kEpsilon;
     data_size_t right_count = 0;
-    double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
-    double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
+    double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+    double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
     is_splittable_ = false;
+    const int bias = meta_->bias;
+    int t = meta_->num_bin - 1 - bias;
+    const int t_end = 1 - bias;
     // from right to left, and we don't need data in bin0
-    for (unsigned int t = num_bins_ - 1; t > 0; --t) {
+    for (; t >= t_end; --t) {
       sum_right_gradient += data_[t].sum_gradients;
       sum_right_hessian += data_[t].sum_hessians;
       right_count += data_[t].cnt;
       // if data not enough, or sum hessian too small
-      if (right_count < tree_config_->min_data_in_leaf
-          || sum_right_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
-      data_size_t left_count = num_data_ - right_count;
+      if (right_count < meta_->tree_config->min_data_in_leaf
+          || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+      data_size_t left_count = num_data - right_count;
       // if data not enough
-      if (left_count < tree_config_->min_data_in_leaf) break;
+      if (left_count < meta_->tree_config->min_data_in_leaf) break;
-      double sum_left_hessian = sum_hessians_ - sum_right_hessian;
+      double sum_left_hessian = sum_hessian - sum_right_hessian;
       // if sum hessian too small
-      if (sum_left_hessian < tree_config_->min_sum_hessian_in_leaf) break;
+      if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
-      double sum_left_gradient = sum_gradients_ - sum_right_gradient;
+      double sum_left_gradient = sum_gradient - sum_right_gradient;
       // current split gain
       double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
                             + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
       // gain with split is worse than without split
-      if (current_gain < min_gain_shift) continue;
+      if (current_gain <= min_gain_shift) continue;
       // mark to is splittable
       is_splittable_ = true;
@@ -156,91 +119,119 @@ public:
best_sum_left_gradient
=
sum_left_gradient
;
best_sum_left_hessian
=
sum_left_hessian
;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold
=
t
-
1
;
best_threshold
=
static_cast
<
uint32_t
>
(
t
-
1
+
bias
)
;
best_gain
=
current_gain
;
}
}
if
(
is_splittable_
)
{
// update split information
output
->
feature
=
feature_idx_
;
output
->
threshold
=
best_threshold
;
output
->
left_output
=
CalculateSplittedLeafOutput
(
best_sum_left_gradient
,
best_sum_left_hessian
);
output
->
left_count
=
best_left_count
;
output
->
left_sum_gradient
=
best_sum_left_gradient
;
output
->
left_sum_hessian
=
best_sum_left_hessian
;
output
->
right_output
=
CalculateSplittedLeafOutput
(
sum_gradient
s_
-
best_sum_left_gradient
,
sum_hessian
s_
-
best_sum_left_hessian
);
output
->
right_count
=
num_data
_
-
best_left_count
;
output
->
right_sum_gradient
=
sum_gradient
s_
-
best_sum_left_gradient
;
output
->
right_sum_hessian
=
sum_hessian
s_
-
best_sum_left_hessian
;
output
->
left_sum_hessian
=
best_sum_left_hessian
-
kEpsilon
;
output
->
right_output
=
CalculateSplittedLeafOutput
(
sum_gradient
-
best_sum_left_gradient
,
sum_hessian
-
best_sum_left_hessian
);
output
->
right_count
=
num_data
-
best_left_count
;
output
->
right_sum_gradient
=
sum_gradient
-
best_sum_left_gradient
;
output
->
right_sum_hessian
=
sum_hessian
-
best_sum_left_hessian
-
kEpsilon
;
output
->
gain
=
best_gain
-
gain_shift
;
}
else
{
output
->
feature
=
feature_idx_
;
output
->
gain
=
kMinScore
;
}
}
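FindBestThresholdNumerical evaluates every bin boundary in a single right-to-left pass: bin t joins the right-side accumulators, and the left side's sums come for free by subtracting from the leaf totals. A toy standalone version of that scan, with the regularization and min-data/min-hessian checks stripped out (all names and numbers are illustrative):

    #include <cstdio>
    #include <vector>

    struct Bin { double g, h; };

    // gain of a leaf with gradient sum G and hessian sum H (unregularized)
    static double LeafGain(double G, double H) { return G * G / H; }

    int main() {
      std::vector<Bin> hist = {{-1.0, 2.0}, {0.5, 1.0}, {2.0, 3.0}};
      double sum_g = 0.0, sum_h = 0.0;
      for (const Bin& b : hist) { sum_g += b.g; sum_h += b.h; }
      double right_g = 0.0, right_h = 0.0, best_gain = -1e30;
      int best_threshold = -1;
      // scan right to left: bin t joins the right side; the threshold is t-1
      for (int t = static_cast<int>(hist.size()) - 1; t > 0; --t) {
        right_g += hist[t].g; right_h += hist[t].h;
        double left_g = sum_g - right_g, left_h = sum_h - right_h;  // left side by subtraction
        double gain = LeafGain(left_g, left_h) + LeafGain(right_g, right_h);
        if (gain > best_gain) { best_gain = gain; best_threshold = t - 1; }
      }
      std::printf("best threshold bin: %d, gain: %.3f\n", best_threshold, best_gain);
      return 0;
    }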
  /*!
  * \brief Find best threshold for this histogram
  * \param output The best split result
  */
- void FindBestThresholdForCategorical(SplitInfo* output) {
+ void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
    double best_gain = kMinScore;
-   unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
-   double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
-   double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
+   uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
+   data_size_t best_left_count = 0;
+   double best_sum_left_gradient = 0.0f;
+   double best_sum_left_hessian = 0.0f;
+   double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
+   double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
    is_splittable_ = false;
-   for (int t = num_bins_ - 1; t >= 0; --t) {
-     double sum_current_gradient = data_[t].sum_gradients;
-     double sum_current_hessian = data_[t].sum_hessians;
-     data_size_t current_count = data_[t].cnt;
+   const int bias = meta_->bias;
+   int t = meta_->num_bin - 1 - bias;
+   const int t_end = 0;
+   // from right to left, and we don't need data in bin0
+   for (; t >= t_end; --t) {
      // if data not enough, or sum hessian too small
-     if (current_count < tree_config_->min_data_in_leaf
-         || sum_current_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
-     data_size_t other_count = num_data_ - current_count;
+     if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
+         || data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+     data_size_t other_count = num_data - data_[t].cnt;
      // if data not enough
-     if (other_count < tree_config_->min_data_in_leaf) continue;
+     if (other_count < meta_->tree_config->min_data_in_leaf) continue;
-     double sum_other_hessian = sum_hessians_ - sum_current_hessian;
+     double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
      // if sum hessian too small
-     if (sum_other_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
+     if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
-     double sum_other_gradient = sum_gradients_ - sum_current_gradient;
+     double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
      // current split gain
-     double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
-         + GetLeafSplitGain(sum_current_gradient, sum_current_hessian);
+     double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+         + GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
      // gain with split is worse than without split
-     if (current_gain < min_gain_shift) continue;
+     if (current_gain <= min_gain_shift) continue;
      // mark to is splittable
      is_splittable_ = true;
      // better split point
      if (current_gain > best_gain) {
-       best_threshold = static_cast<unsigned int>(t);
+       best_threshold = static_cast<uint32_t>(t + bias);
+       best_sum_left_gradient = data_[t].sum_gradients;
+       best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
+       best_left_count = data_[t].cnt;
        best_gain = current_gain;
      }
    }
-   // update split information
+   // need restore zero bin
+   if (bias == 1) {
+     t = meta_->num_bin - 1 - bias;
+     double sum_bin0_gradient = sum_gradient;
+     double sum_bin0_hessian = sum_hessian - 2 * kEpsilon;
+     data_size_t cnt_bin0 = num_data;
+     for (; t >= 0; --t) {
+       sum_bin0_gradient -= data_[t].sum_gradients;
+       sum_bin0_hessian -= data_[t].sum_hessians;
+       cnt_bin0 -= data_[t].cnt;
+     }
+     data_size_t other_count = num_data - cnt_bin0;
+     double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
+     if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
+         && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
+         && other_count >= meta_->tree_config->min_data_in_leaf
+         && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
+       double sum_other_gradient = sum_gradient - sum_bin0_gradient;
+       double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+           + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
+       if (current_gain > min_gain_shift) {
+         is_splittable_ = true;
+         // better split point
+         if (current_gain > best_gain) {
+           best_threshold = static_cast<uint32_t>(0);
+           best_sum_left_gradient = sum_bin0_gradient;
+           best_sum_left_hessian = sum_bin0_hessian + kEpsilon;
+           best_left_count = cnt_bin0;
+           best_gain = current_gain;
+         }
+       }
+     }
+   }
    if (is_splittable_) {
-     output->feature = feature_idx_;
+     // update split information
      output->threshold = best_threshold;
-     output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients, data_[best_threshold].sum_hessians);
-     output->left_count = data_[best_threshold].cnt;
-     output->left_sum_gradient = data_[best_threshold].sum_gradients;
-     output->left_sum_hessian = data_[best_threshold].sum_hessians;
-     output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - data_[best_threshold].sum_gradients, sum_hessians_ - data_[best_threshold].sum_hessians);
-     output->right_count = num_data_ - data_[best_threshold].cnt;
-     output->right_sum_gradient = sum_gradients_ - data_[best_threshold].sum_gradients;
-     output->right_sum_hessian = sum_hessians_ - data_[best_threshold].sum_hessians;
+     output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
+     output->left_count = best_left_count;
+     output->left_sum_gradient = best_sum_left_gradient;
+     output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
+     output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian);
+     output->right_count = num_data - best_left_count;
+     output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
+     output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain - gain_shift;
    } else {
-     output->feature = feature_idx_;
      output->gain = kMinScore;
    }
  }
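The bias == 1 branch handles a storage optimization visible throughout this refactor: when a feature's default bin is 0, bin 0 is not stored in the histogram at all, so before it can be offered as a categorical split its statistics must be restored by subtracting every stored bin from the leaf totals. A hedged standalone sketch of that restoration (all variable names and numbers are made up):

    #include <cstdio>
    #include <vector>

    struct Bin { double g, h; int cnt; };

    int main() {
      // the stored histogram covers bins 1..n only; bin 0 (the default bin) is implicit
      std::vector<Bin> stored = {{1.0, 2.0, 3}, {-0.5, 1.0, 2}};
      double leaf_g = 2.0, leaf_h = 5.0;  // totals over ALL data in the leaf
      int leaf_cnt = 10;
      // restore bin 0 by subtracting everything that was stored
      double bin0_g = leaf_g, bin0_h = leaf_h;
      int bin0_cnt = leaf_cnt;
      for (const Bin& b : stored) {
        bin0_g -= b.g; bin0_h -= b.h; bin0_cnt -= b.cnt;
      }
      std::printf("bin0: g=%.2f h=%.2f cnt=%d\n", bin0_g, bin0_h, bin0_cnt);
      return 0;
    }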
...
...
@@ -249,21 +240,14 @@ public:
  * \brief Binary size of this histogram
  */
  int SizeOfHistgram() const {
-   return num_bins_ * sizeof(HistogramBinEntry);
- }
- /*!
- * \brief Memory pointer to histogram data
- */
- const HistogramBinEntry* HistogramData() const {
-   return data_.data();
+   return (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry);
  }
  /*!
  * \brief Restore histogram from memory
  */
- void FromMemory(char* memory_data) {
-   std::memcpy(data_.data(), memory_data, num_bins_ * sizeof(HistogramBinEntry));
+ void FromMemory(char* memory_data) {
+   std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry));
  }
  /*!
...
...
@@ -276,10 +260,6 @@ public:
  */
  void set_is_splittable(bool val) { is_splittable_ = val; }
- void ResetConfig(const TreeConfig* tree_config) {
-   tree_config_ = tree_config;
- }
private:
  /*!
  * \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
...
...
@@ -289,12 +269,10 @@ private:
  */
  double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
    double abs_sum_gradients = std::fabs(sum_gradients);
-   if (abs_sum_gradients > tree_config_->lambda_l1) {
-     double reg_abs_sum_gradients = abs_sum_gradients - tree_config_->lambda_l1;
-     return (reg_abs_sum_gradients * reg_abs_sum_gradients) / (sum_hessians + tree_config_->lambda_l2);
-   }
-   return 0.0f;
+   double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+   return (reg_abs_sum_gradients * reg_abs_sum_gradients) / (sum_hessians + meta_->tree_config->lambda_l2);
  }
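Both the old and new bodies compute the same L1/L2-regularized quantities: the score of a leaf with gradient sum G and hessian sum H is (max(0, |G| - lambda_l1))^2 / (H + lambda_l2), and its optimal output is -sign(G) * max(0, |G| - lambda_l1) / (H + lambda_l2). A standalone sketch of the pair in the refactored std::max form (parameter names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    double LeafSplitGain(double sum_g, double sum_h, double l1, double l2) {
      double reg = std::max(0.0, std::fabs(sum_g) - l1);  // L1 soft-threshold
      return reg * reg / (sum_h + l2);                    // L2 shrinks the denominator
    }

    double LeafOutput(double sum_g, double sum_h, double l1, double l2) {
      double reg = std::max(0.0, std::fabs(sum_g) - l1);
      return -std::copysign(reg, sum_g) / (sum_h + l2);   // Newton step with sign restored
    }

    int main() {
      std::printf("gain=%.4f output=%.4f\n",
                  LeafSplitGain(-3.0, 4.0, 0.5, 1.0), LeafOutput(-3.0, 4.0, 0.5, 1.0));
      return 0;
    }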
  /*!
...
...
@@ -305,35 +283,19 @@ private:
  */
  double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
    double abs_sum_gradients = std::fabs(sum_gradients);
-   if (abs_sum_gradients > tree_config_->lambda_l1) {
-     return -std::copysign(abs_sum_gradients - tree_config_->lambda_l1, sum_gradients) / (sum_hessians + tree_config_->lambda_l2);
-   }
-   return 0.0f;
+   double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
+   return -std::copysign(reg_abs_sum_gradients, sum_gradients) / (sum_hessians + meta_->tree_config->lambda_l2);
  }
- int feature_idx_;
- /*! \brief pointer of tree config */
- const TreeConfig* tree_config_;
- /*! \brief the bin data of current feature */
- const Bin* bin_data_;
- /*! \brief number of bin of histogram */
- unsigned int num_bins_;
+ const FeatureMetainfo* meta_;
  /*! \brief sum of gradient of each bin */
- std::vector<HistogramBinEntry> data_;
- /*! \brief number of all data */
- data_size_t num_data_;
- /*! \brief sum of gradient of current leaf */
- double sum_gradients_;
- /*! \brief sum of hessians of current leaf */
- double sum_hessians_;
+ HistogramBinEntry* data_;
+ //std::vector<HistogramBinEntry> data_;
  /*! \brief False if this histogram cannot split */
  bool is_splittable_ = true;
  /*! \brief function that used to find best threshold */
- std::function<void(SplitInfo*)> find_best_threshold_fun_;
- };
+ std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
+ };
class HistogramPool {
public:
  /*!
...
...
@@ -343,7 +305,6 @@ public:
    cache_size_ = 0;
    total_size_ = 0;
  }
/*!
* \brief Destructor
*/
...
...
@@ -370,7 +331,6 @@ public:
      ResetMap();
    }
  }
  /*!
  * \brief Reset mapper
  */
...
...
@@ -383,34 +343,48 @@ public:
    }
  }
  /*!
- * \brief Fill the pool
- * \param obj_create_fun that used to generate object
  */
- void Fill(std::function<FeatureHistogram*()> obj_create_fun) {
-   fill_func_ = obj_create_fun;
-   pool_.clear();
-   pool_.resize(cache_size_);
-   for (int i = 0; i < cache_size_; ++i) {
-     pool_[i].reset(obj_create_fun());
-   }
- }
+ void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
+   if (feature_metas_.empty()) {
+     feature_metas_.resize(train_data->num_features());
+ #pragma omp parallel for schedule(static)
+     for (int i = 0; i < train_data->num_features(); ++i) {
+       feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+       if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
+         feature_metas_[i].bias = 1;
+       } else {
+         feature_metas_[i].bias = 0;
+       }
+       feature_metas_[i].tree_config = tree_config;
+     }
+   }
- void DynamicChangeSize(int cache_size, int total_size) {
+   uint64_t num_total_bin = train_data->NumTotalBin();
+   Log::Info("Total Bins %d", num_total_bin);
    int old_cache_size = cache_size_;
    Reset(cache_size, total_size);
-   pool_.resize(cache_size_);
+   pool_.resize(cache_size);
+   data_.resize(cache_size);
+ #pragma omp parallel for schedule(static)
    for (int i = old_cache_size; i < cache_size_; ++i) {
-     pool_[i].reset(fill_func_());
+     pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
+     data_[i].resize(num_total_bin);
+     uint64_t offset = 0;
+     for (int j = 0; j < train_data->num_features(); ++j) {
+       offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
+       pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
+       auto num_bin = train_data->FeatureNumBin(j);
+       if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
+         num_bin -= 1;
+       }
+       offset += static_cast<uint64_t>(num_bin);
+     }
+     CHECK(offset == num_total_bin);
    }
  }
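The rewritten DynamicChangeSize replaces one heap allocation per feature histogram with a single contiguous HistogramBinEntry buffer per cache slot; each FeatureHistogram is pointed into that buffer at a running offset, shrunk by one entry wherever a default bin is elided. A small sketch of just the offset bookkeeping, with hypothetical bin counts (the real code also folds in SubFeatureBinOffset):

    #include <cstdio>
    #include <vector>

    int main() {
      // hypothetical per-feature bin counts, with a flag for "default bin is 0"
      std::vector<int> num_bin      = {16, 8, 32};
      std::vector<bool> default_is0 = {true, false, true};
      std::vector<size_t> offset(num_bin.size());
      size_t total = 0;
      for (size_t j = 0; j < num_bin.size(); ++j) {
        offset[j] = total;                               // feature j's slice starts here
        total += num_bin[j] - (default_is0[j] ? 1 : 0);  // elided default bin saves a slot
      }
      for (size_t j = 0; j < num_bin.size(); ++j)
        std::printf("feature %zu -> offset %zu\n", j, offset[j]);
      std::printf("total bins in buffer: %zu\n", total);
      return 0;
    }

One buffer per cache slot keeps all of a leaf's histograms cache-friendly and lets the whole histogram be shipped or subtracted as one block.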
- void ResetConfig(const TreeConfig* tree_config, int array_size) {
-   for (int i = 0; i < cache_size_; ++i) {
-     auto data_ptr = pool_[i].get();
-     for (int j = 0; j < array_size; ++j) {
-       data_ptr[j].ResetConfig(tree_config);
-     }
-   }
- }
+ void ResetConfig(const TreeConfig* tree_config) {
+ #pragma omp parallel for schedule(static)
+   for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
+     feature_metas_[i].tree_config = tree_config;
+   }
+ }
  /*!
...
...
@@ -468,9 +442,9 @@ public:
    inverse_mapper_[slot] = dst_idx;
  }
private:
  std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
- std::function<FeatureHistogram*()> fill_func_;
+ std::vector<std::vector<HistogramBinEntry>> data_;
+ std::vector<FeatureMetainfo> feature_metas_;
  int cache_size_;
  int total_size_;
  bool is_enough_ = false;
...
...
@@ -480,7 +454,5 @@ private:
  int cur_time_ = 0;
};
}  // namespace LightGBM
#endif  // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
src/treelearner/feature_parallel_tree_learner.cpp
View file @ eade219e
...
...
@@ -28,12 +28,14 @@ void FeatureParallelTreeLearner::BeforeTrain() {
  // get feature partition
  std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
  std::vector<int> num_bins_distributed(num_machines_, 0);
- for (int i = 0; i < train_data_->num_features(); ++i) {
-   if (is_feature_used_[i]) {
+ for (int i = 0; i < train_data_->num_total_features(); ++i) {
+   int inner_feature_index = train_data_->InnerFeatureIndex(i);
+   if (inner_feature_index == -1) { continue; }
+   if (is_feature_used_[inner_feature_index]) {
      int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
-     feature_distribution[cur_min_machine].push_back(i);
-     num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
-     is_feature_used_[i] = false;
+     feature_distribution[cur_min_machine].push_back(inner_feature_index);
+     num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(inner_feature_index);
+     is_feature_used_[inner_feature_index] = false;
    }
  }
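The loop above is a greedy balancing pass: every used feature is assigned to whichever machine currently holds the fewest total bins, which approximates an even histogram workload in feature-parallel training. A standalone sketch of the same assignment rule (machine count and bin counts are invented):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      const int num_machines = 3;
      std::vector<int> feature_bins = {255, 64, 32, 128, 16, 255};  // hypothetical
      std::vector<std::vector<int>> assignment(num_machines);
      std::vector<int> load(num_machines, 0);
      for (int f = 0; f < static_cast<int>(feature_bins.size()); ++f) {
        // pick the machine with the smallest current bin load
        int target = static_cast<int>(
            std::min_element(load.begin(), load.end()) - load.begin());
        assignment[target].push_back(f);
        load[target] += feature_bins[f];
      }
      for (int m = 0; m < num_machines; ++m)
        std::printf("machine %d: load %d, %zu features\n",
                    m, load[m], assignment[m].size());
      return 0;
    }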
// get local used features
...
...
@@ -43,23 +45,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
}

void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
- int smaller_best_feature = -1, larger_best_feature = -1;
  SplitInfo smaller_best, larger_best;
- // get best split at smaller leaf
- std::vector<double> gains;
- for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
-   gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
- }
- smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
- smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
- // get best split at larger leaf
+ smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
+ // find local best split for larger leaf
  if (larger_leaf_splits_->LeafIndex() >= 0) {
-   gains.clear();
-   for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
-     gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
-   }
-   larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-   larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
+   larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
  }
  // sync global best info
  std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
...
...
src/treelearner/leaf_splits.hpp
View file @ eade219e
...
...
@@ -3,7 +3,6 @@
#include <LightGBM/meta.h>
#include "data_partition.hpp"
- #include "split_info.hpp"
#include <vector>
...
...
@@ -14,13 +13,13 @@ namespace LightGBM {
*/
class LeafSplits {
public:
- LeafSplits(int num_feature, data_size_t num_data)
-   :num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature),
+ LeafSplits(data_size_t num_data)
+   :num_data_in_leaf_(num_data), num_data_(num_data),
    data_indices_(nullptr) {
-   best_split_per_feature_.resize(num_features_);
-   for (int i = 0; i < num_features_; ++i) {
-     best_split_per_feature_[i].feature = i;
-   }
  }
+ void ResetNumData(data_size_t num_data) {
+   num_data_ = num_data;
+   num_data_in_leaf_ = num_data;
+ }
  ~LeafSplits() {
  }
...
...
@@ -38,9 +37,6 @@ public:
    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
-   for (SplitInfo& split_info : best_split_per_feature_) {
-     split_info.Reset();
-   }
  }
/*!
...
...
@@ -61,9 +57,6 @@ public:
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
-   for (SplitInfo& split_info : best_split_per_feature_) {
-     split_info.Reset();
-   }
  }
/*!
...
...
@@ -86,9 +79,6 @@ public:
    }
    sum_gradients_ = tmp_sum_gradients;
    sum_hessians_ = tmp_sum_hessians;
-   for (SplitInfo& split_info : best_split_per_feature_) {
-     split_info.Reset();
-   }
  }
...
...
@@ -101,9 +91,6 @@ public:
    leaf_index_ = 0;
    sum_gradients_ = sum_gradients;
    sum_hessians_ = sum_hessians;
-   for (SplitInfo& split_info : best_split_per_feature_) {
-     split_info.Reset();
-   }
  }
/*!
...
...
@@ -111,13 +98,10 @@ public:
  */
  void Init() {
    leaf_index_ = -1;
-   for (SplitInfo& split_info : best_split_per_feature_) {
-     split_info.Reset();
-   }
    data_indices_ = nullptr;
    num_data_in_leaf_ = 0;
  }
- /*! \brief Get best splits on all features */
- std::vector<SplitInfo>& BestSplitPerFeature() { return best_split_per_feature_; }
  /*! \brief Get current leaf index */
  int LeafIndex() const { return leaf_index_; }
...
...
@@ -136,16 +120,12 @@ public:
private:
- /*! \brief store best splits of all feature on current leaf */
- std::vector<SplitInfo> best_split_per_feature_;
  /*! \brief current leaf index */
  int leaf_index_;
  /*! \brief number of data on current leaf */
  data_size_t num_data_in_leaf_;
  /*! \brief number of all training data */
  data_size_t num_data_;
- /*! \brief number of features */
- int num_features_;
  /*! \brief sum of gradients of current leaf */
  double sum_gradients_;
  /*! \brief sum of hessians of current leaf */
...
...
src/treelearner/parallel_tree_learner.h
View file @ eade219e
...
...
@@ -22,7 +22,7 @@ class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
  explicit FeatureParallelTreeLearner(const TreeConfig* tree_config);
  ~FeatureParallelTreeLearner();
- virtual void Init(const Dataset* train_data);
+ void Init(const Dataset* train_data) override;
protected:
  void BeforeTrain() override;
...
...
@@ -170,6 +170,10 @@ private:
  std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_;
  /*! \brief Store global histogram for larger leaf */
  std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
+ std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
+ std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
+ std::vector<FeatureMetainfo> feature_metas_;
};
}  // namespace LightGBM
...
...
src/treelearner/serial_tree_learner.cpp
View file @ eade219e
...
...
@@ -7,13 +7,34 @@
namespace LightGBM {

+ #ifdef TIMETAG
+ std::chrono::duration<double, std::milli> init_train_time;
+ std::chrono::duration<double, std::milli> init_split_time;
+ std::chrono::duration<double, std::milli> hist_time;
+ std::chrono::duration<double, std::milli> find_split_time;
+ std::chrono::duration<double, std::milli> split_time;
+ std::chrono::duration<double, std::milli> ordered_bin_time;
+ #endif // TIMETAG

SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config)
- :tree_config_(tree_config){
+ :tree_config_(tree_config) {
  random_ = Random(tree_config_->feature_fraction_seed);
+ #pragma omp parallel
+ #pragma omp master
+ {
+   num_threads_ = omp_get_num_threads();
+ }
}

SerialTreeLearner::~SerialTreeLearner() {
+ #ifdef TIMETAG
+ Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3);
+ Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3);
+ Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3);
+ Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
+ Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
+ Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
+ #endif
}

void SerialTreeLearner::Init(const Dataset* train_data) {
...
...
@@ -27,49 +48,74 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
  } else {
    size_t total_histogram_size = 0;
    for (int i = 0; i < train_data_->num_features(); ++i) {
-     total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin();
+     total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
    }
    max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
  }
  // at least need 2 leaves
  max_cache_size = std::max(2, max_cache_size);
  max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
- histogram_pool_.Reset(max_cache_size, tree_config_->num_leaves);
- auto histogram_create_function = [this]() {
-   auto tmp_histogram_array = std::unique_ptr<FeatureHistogram[]>(new FeatureHistogram[train_data_->num_features()]);
-   for (int j = 0; j < train_data_->num_features(); ++j) {
-     tmp_histogram_array[j].Init(train_data_->FeatureAt(j), j, tree_config_);
-   }
-   return tmp_histogram_array.release();
- };
- histogram_pool_.Fill(histogram_create_function);
+ histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
  // push split information for all leaves
  best_split_per_leaf_.resize(tree_config_->num_leaves);
- // initialize ordered_bins_ with nullptr
- ordered_bins_.resize(num_features_);
  // get ordered bin
- #pragma omp parallel for schedule(guided)
- for (int i = 0; i < num_features_; ++i) {
-   ordered_bins_[i].reset(train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin());
- }
+ train_data_->CreateOrderedBins(&ordered_bins_);
  // check existing for ordered bin
- for (int i = 0; i < num_features_; ++i) {
+ for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
    if (ordered_bins_[i] != nullptr) {
      has_ordered_bin_ = true;
      break;
    }
  }
  // initialize splits for leaf
- smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
- larger_leaf_splits_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
+ smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
+ larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
  // initialize data partition
  data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
  is_feature_used_.resize(num_features_);
  // initialize ordered gradients and hessians
  ordered_gradients_.resize(num_data_);
  ordered_hessians_.resize(num_data_);
  // if has ordered bin, need to allocate a buffer to fast split
  if (has_ordered_bin_) {
    is_data_in_leaf_.resize(num_data_);
    std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
+   ordered_bin_indices_.clear();
+   for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
+     if (ordered_bins_[i] != nullptr) {
+       ordered_bin_indices_.push_back(i);
+     }
+   }
  }
  Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
}
+ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
+   train_data_ = train_data;
+   num_data_ = train_data_->num_data();
+   num_features_ = train_data_->num_features();
+   // get ordered bin
+   train_data_->CreateOrderedBins(&ordered_bins_);
+   has_ordered_bin_ = false;
+   // check existing for ordered bin
+   for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
+     if (ordered_bins_[i] != nullptr) {
+       has_ordered_bin_ = true;
+       break;
+     }
+   }
+   // initialize splits for leaf
+   smaller_leaf_splits_->ResetNumData(num_data_);
+   larger_leaf_splits_->ResetNumData(num_data_);
+   // initialize data partition
+   data_partition_->ResetNumData(num_data_);
+   is_feature_used_.resize(num_features_);
...
...
@@ -79,11 +125,16 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
  // if has ordered bin, need to allocate a buffer to fast split
  if (has_ordered_bin_) {
    is_data_in_leaf_.resize(num_data_);
    std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
+   ordered_bin_indices_.clear();
+   for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
+     if (ordered_bins_[i] != nullptr) {
+       ordered_bin_indices_.push_back(i);
+     }
+   }
  }
  Log::Info("Number of data: %d, number of features: %d", num_data_, num_features_);
}
void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
  if (tree_config_->num_leaves != tree_config->num_leaves) {
    tree_config_ = tree_config;
...
...
@@ -94,14 +145,14 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
  } else {
    size_t total_histogram_size = 0;
    for (int i = 0; i < train_data_->num_features(); ++i) {
-     total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin();
+     total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
    }
    max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
  }
  // at least need 2 leaves
  max_cache_size = std::max(2, max_cache_size);
  max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
- histogram_pool_.DynamicChangeSize(max_cache_size, tree_config_->num_leaves);
+ histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
  // push split information for all leaves
  best_split_per_leaf_.resize(tree_config_->num_leaves);
...
...
@@ -110,24 +161,40 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
    tree_config_ = tree_config;
  }
- histogram_pool_.ResetConfig(tree_config_, train_data_->num_features());
+ histogram_pool_.ResetConfig(tree_config_);
}

Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t* hessians) {
  gradients_ = gradients;
  hessians_ = hessians;
+ #ifdef TIMETAG
+ auto start_time = std::chrono::steady_clock::now();
+ #endif
  // some initial works before training
  BeforeTrain();
+ #ifdef TIMETAG
+ init_train_time += std::chrono::steady_clock::now() - start_time;
+ #endif
  auto tree = std::unique_ptr<Tree>(new Tree(tree_config_->num_leaves));
  // save pointer to last trained tree
  last_trained_tree_ = tree.get();
  // root leaf
  int left_leaf = 0;
+ int cur_depth = 1;
  // only root leaf can be splitted on first time
  int right_leaf = -1;
- for (int split = 0; split < tree_config_->num_leaves - 1; split++) {
+ for (int split = 0; split < tree_config_->num_leaves - 1; ++split) {
+ #ifdef TIMETAG
+   start_time = std::chrono::steady_clock::now();
+ #endif
    // some initial works before finding best split
    if (BeforeFindBestSplit(left_leaf, right_leaf)) {
+ #ifdef TIMETAG
+     init_split_time += std::chrono::steady_clock::now() - start_time;
+ #endif
      // find best threshold for every feature
      FindBestThresholds();
      // find best split from all features
...
...
@@ -139,13 +206,20 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    // cannot split, quit
    if (best_leaf_SplitInfo.gain <= 0.0) {
-     Log::Info("No further splits with positive gain, best gain: %f, leaves: %d", best_leaf_SplitInfo.gain, split + 1);
+     Log::Info("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
      break;
    }
+ #ifdef TIMETAG
+   start_time = std::chrono::steady_clock::now();
+ #endif
    // split tree with best leaf
    Split(tree.get(), best_leaf, &left_leaf, &right_leaf);
+ #ifdef TIMETAG
+   split_time += std::chrono::steady_clock::now() - start_time;
+ #endif
+   cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
  }
+ Log::Info("Trained a tree with leaves=%d and max_depth=%d", tree->num_leaves(), cur_depth);
  return tree.release();
}
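Train is the leaf-wise (best-first) growth loop: each iteration re-finds thresholds, takes the single leaf whose best split has the highest gain, splits only that leaf, and exits early once no candidate has positive gain. A compact sketch of that control flow; the per-leaf gains here are faked with a priority queue rather than computed from histograms:

    #include <cstdio>
    #include <queue>

    int main() {
      const int num_leaves = 5;
      // synthetic "best gain per candidate leaf"; a real learner recomputes these
      std::priority_queue<double> gains;
      const double gs[] = {3.0, 1.5, 0.7, 0.0, -1.0, 2.2};
      for (double g : gs) gains.push(g);
      int leaves = 1;
      while (leaves < num_leaves && !gains.empty()) {
        double best = gains.top(); gains.pop();
        if (best <= 0.0) {  // mirrors the "no further splits with positive gain" exit
          std::printf("stop: best gain %.2f\n", best);
          break;
        }
        ++leaves;  // one split turns one leaf into two
        std::printf("split with gain %.2f -> %d leaves\n", best, leaves);
      }
      return 0;
    }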
...
...
@@ -153,15 +227,24 @@ void SerialTreeLearner::BeforeTrain() {
  // reset histogram pool
  histogram_pool_.ResetMap();
- // initialize used features
- for (int i = 0; i < num_features_; ++i) {
-   is_feature_used_[i] = false;
- }
- // Get used feature at current tree
- int used_feature_cnt = static_cast<int>(num_features_ * tree_config_->feature_fraction);
- auto used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
- for (auto idx : used_feature_indices) {
-   is_feature_used_[idx] = true;
+ if (tree_config_->feature_fraction < 1) {
+   int used_feature_cnt = static_cast<int>(train_data_->num_total_features() * tree_config_->feature_fraction);
+   // initialize used features
+   std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
+   // Get used feature at current tree
+   auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
+ #pragma omp parallel for schedule(static)
+   for (int i = 0; i < static_cast<int>(used_feature_indices.size()); ++i) {
+     int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
+     if (inner_feature_index < 0) { continue; }
+     is_feature_used_[inner_feature_index] = 1;
+   }
+ } else {
+ #pragma omp parallel for schedule(static)
+   for (int i = 0; i < num_features_; ++i) {
+     is_feature_used_[i] = 1;
+   }
  }
// initialize data partition
...
...
@@ -176,60 +259,49 @@ void SerialTreeLearner::BeforeTrain() {
  if (data_partition_->leaf_count(0) == num_data_) {
    // use all data
    smaller_leaf_splits_->Init(gradients_, hessians_);
-   // point to gradients, avoid copy
-   ptr_to_ordered_gradients_smaller_leaf_ = gradients_;
-   ptr_to_ordered_hessians_smaller_leaf_ = hessians_;
  } else {
    // use bagging, only use part of data
    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
-   // copy used gradients and hessians to ordered buffer
-   const data_size_t* indices = data_partition_->indices();
-   data_size_t cnt = data_partition_->leaf_count(0);
- #pragma omp parallel for schedule(static)
-   for (data_size_t i = 0; i < cnt; ++i) {
-     ordered_gradients_[i] = gradients_[indices[i]];
-     ordered_hessians_[i] = hessians_[indices[i]];
-   }
-   // point to ordered_gradients_ and ordered_hessians_
-   ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_.data();
-   ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_.data();
  }
- ptr_to_ordered_gradients_larger_leaf_ = nullptr;
- ptr_to_ordered_hessians_larger_leaf_ = nullptr;
  larger_leaf_splits_->Init();
  // if has ordered bin, need to initialize the ordered bin
  if (has_ordered_bin_) {
+ #ifdef TIMETAG
+   auto start_time = std::chrono::steady_clock::now();
+ #endif
    if (data_partition_->leaf_count(0) == num_data_) {
      // use all data, pass nullptr
-     #pragma omp parallel for schedule(guided)
-     for (int i = 0; i < num_features_; ++i) {
-       if (ordered_bins_[i] != nullptr) {
-         ordered_bins_[i]->Init(nullptr, tree_config_->num_leaves);
-       }
+     #pragma omp parallel for schedule(static)
+     for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
+       ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
      }
    } else {
      // bagging, only use part of data
      // mark used data
      std::memset(is_data_in_leaf_.data(), 0, sizeof(char) * num_data_);
      const data_size_t* indices = data_partition_->indices();
      data_size_t begin = data_partition_->leaf_begin(0);
      data_size_t end = begin + data_partition_->leaf_count(0);
      #pragma omp parallel for schedule(static)
      for (data_size_t i = begin; i < end; ++i) {
        is_data_in_leaf_[indices[i]] = 1;
      }
      // initialize ordered bin
-     #pragma omp parallel for schedule(guided)
-     for (int i = 0; i < num_features_; ++i) {
-       if (ordered_bins_[i] != nullptr) {
-         ordered_bins_[i]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
-       }
+     #pragma omp parallel for schedule(static)
+     for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
+       ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
      }
+     #pragma omp parallel for schedule(static)
+     for (data_size_t i = begin; i < end; ++i) {
+       is_data_in_leaf_[indices[i]] = 0;
+     }
    }
+ #ifdef TIMETAG
+   ordered_bin_time += std::chrono::steady_clock::now() - start_time;
+ #endif
  }
}
...
...
@@ -249,7 +321,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // no enough data to continue
  if (num_data_in_right_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)
      && num_data_in_left_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)) {
    best_split_per_leaf_[left_leaf].gain = kMinScore;
    if (right_leaf >= 0) {
      best_split_per_leaf_[right_leaf].gain = kMinScore;
...
...
@@ -257,172 +329,184 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
    return false;
  }
  parent_leaf_histogram_array_ = nullptr;
- // -1 if only has one leaf. else equal the index of smaller leaf
- int smaller_leaf = -1;
- int larger_leaf = -1;
  // only have root
  if (right_leaf < 0) {
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
    larger_leaf_histogram_array_ = nullptr;
  } else if (num_data_in_left_child < num_data_in_right_child) {
-   smaller_leaf = left_leaf;
-   larger_leaf = right_leaf;
    // put parent(left) leaf's histograms into larger leaf's histograms
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Move(left_leaf, right_leaf);
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
  } else {
-   smaller_leaf = right_leaf;
-   larger_leaf = left_leaf;
    // put parent(left) leaf's histograms to larger leaf's histograms
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
  }
- // init for the ordered gradients, only initialize when have 2 leaves
- if (smaller_leaf >= 0) {
-   // only need to initialize for smaller leaf
-   // Get leaf boundary
-   const data_size_t* indices = data_partition_->indices();
-   data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
-   data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
-   // copy
- #pragma omp parallel for schedule(static)
-   for (data_size_t i = begin; i < end; ++i) {
-     ordered_gradients_[i - begin] = gradients_[indices[i]];
-     ordered_hessians_[i - begin] = hessians_[indices[i]];
-   }
-   // assign pointer
-   ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_.data();
-   ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_.data();
-   if (parent_leaf_histogram_array_ == nullptr) {
-     // need order gradient for larger leaf
-     data_size_t smaller_size = end - begin;
-     data_size_t larger_begin = data_partition_->leaf_begin(larger_leaf);
-     data_size_t larger_end = larger_begin + data_partition_->leaf_count(larger_leaf);
-     // copy
- #pragma omp parallel for schedule(static)
-     for (data_size_t i = larger_begin; i < larger_end; ++i) {
-       ordered_gradients_[smaller_size + i - larger_begin] = gradients_[indices[i]];
-       ordered_hessians_[smaller_size + i - larger_begin] = hessians_[indices[i]];
-     }
-     ptr_to_ordered_gradients_larger_leaf_ = ordered_gradients_.data() + smaller_size;
-     ptr_to_ordered_hessians_larger_leaf_ = ordered_hessians_.data() + smaller_size;
-   }
- }
  // split for the ordered bin
  if (has_ordered_bin_ && right_leaf >= 0) {
+ #ifdef TIMETAG
+   auto start_time = std::chrono::steady_clock::now();
+ #endif
    // mark data that at left-leaf
    std::memset(is_data_in_leaf_.data(), 0, sizeof(char) * num_data_);
    const data_size_t* indices = data_partition_->indices();
+   const auto left_cnt = data_partition_->leaf_count(left_leaf);
+   const auto right_cnt = data_partition_->leaf_count(right_leaf);
+   char mark = 1;
    data_size_t begin = data_partition_->leaf_begin(left_leaf);
-   data_size_t end = begin + data_partition_->leaf_count(left_leaf);
- #pragma omp parallel for schedule(static)
+   data_size_t end = begin + left_cnt;
+   if (left_cnt > right_cnt) {
+     begin = data_partition_->leaf_begin(right_leaf);
+     end = begin + right_cnt;
+     mark = 0;
+   }
+ #pragma omp parallel for schedule(static)
    for (data_size_t i = begin; i < end; ++i) {
      is_data_in_leaf_[indices[i]] = 1;
    }
    // split the ordered bin
-   #pragma omp parallel for schedule(guided)
-   for (int i = 0; i < num_features_; ++i) {
-     if (ordered_bins_[i] != nullptr) {
-       ordered_bins_[i]->Split(left_leaf, right_leaf, is_data_in_leaf_.data());
-     }
+   #pragma omp parallel for schedule(static)
+   for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
+     ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
    }
+ #pragma omp parallel for schedule(static)
+   for (data_size_t i = begin; i < end; ++i) {
+     is_data_in_leaf_[indices[i]] = 0;
+   }
+ #ifdef TIMETAG
+   ordered_bin_time += std::chrono::steady_clock::now() - start_time;
+ #endif
  }
  return true;
}
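One refinement in the new ordered-bin path above: rather than always marking the left child's rows, it marks whichever child has fewer rows and flips the mark bit passed to Split, so the O(num_data) marking pass always touches the smaller side. A sketch of just that choice (counts are illustrative):

    #include <cstdio>

    int main() {
      int left_cnt = 900, right_cnt = 100;
      char mark = 1;            // 1: marked rows belong to the LEFT child
      int rows_touched = left_cnt;
      if (left_cnt > right_cnt) {
        mark = 0;               // flip meaning: marked rows belong to the RIGHT child
        rows_touched = right_cnt;  // ...and mark the smaller side instead
      }
      std::printf("mark=%d, rows touched=%d out of %d\n",
                  mark, rows_touched, left_cnt + right_cnt);
      return 0;
    }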
void SerialTreeLearner::FindBestThresholds() {
- #pragma omp parallel for schedule(guided)
- for (int feature_index = 0; feature_index < num_features_; feature_index++) {
-   // feature is not used
-   if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
-   // if parent(larger) leaf cannot split at current feature
-   if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
+ #ifdef TIMETAG
+ auto start_time = std::chrono::steady_clock::now();
+ #endif
+ std::vector<int8_t> is_feature_used(num_features_, 0);
+ #pragma omp parallel for schedule(static)
+ for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+   if (!is_feature_used_[feature_index]) continue;
+   if (parent_leaf_histogram_array_ != nullptr
+       && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
-   // construct histograms for smaller leaf
-   if (ordered_bins_[feature_index] == nullptr) {
-     // if not use ordered bin
-     smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), ptr_to_ordered_gradients_smaller_leaf_, ptr_to_ordered_hessians_smaller_leaf_);
-   } else {
-     // used ordered bin
-     smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index].get(), smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), gradients_, hessians_);
+   is_feature_used[feature_index] = 1;
  }
+ bool use_subtract = true;
+ if (parent_leaf_histogram_array_ == nullptr) { use_subtract = false; }
+ // construct smaller leaf
+ HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
+ train_data_->ConstructHistograms(is_feature_used,
+     smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
+     smaller_leaf_splits_->LeafIndex(),
+     ordered_bins_, gradients_, hessians_,
+     ordered_gradients_.data(), ordered_hessians_.data(),
+     ptr_smaller_leaf_hist_data);
+ if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
+   // construct larger leaf
+   HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
+   train_data_->ConstructHistograms(is_feature_used,
+       larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
+       larger_leaf_splits_->LeafIndex(),
+       ordered_bins_, gradients_, hessians_,
+       ordered_gradients_.data(), ordered_hessians_.data(),
+       ptr_larger_leaf_hist_data);
+ }
+ #ifdef TIMETAG
+ hist_time += std::chrono::steady_clock::now() - start_time;
+ #endif
+ #ifdef TIMETAG
+ start_time = std::chrono::steady_clock::now();
+ #endif
+ std::vector<SplitInfo> smaller_best(num_threads_);
+ std::vector<SplitInfo> larger_best(num_threads_);
+ // find splits
+ #pragma omp parallel for schedule(static)
+ for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+   if (!is_feature_used[feature_index]) { continue; }
+   const int tid = omp_get_thread_num();
+   SplitInfo smaller_split;
+   train_data_->FixHistogram(feature_index,
+       smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+       smaller_leaf_splits_->num_data_in_leaf(),
+       smaller_leaf_histogram_array_[feature_index].RawData());
+   smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
+       smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+       smaller_leaf_splits_->num_data_in_leaf(), &smaller_split);
+   if (smaller_split.gain > smaller_best[tid].gain) {
+     smaller_best[tid] = smaller_split;
+     smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+   }
-   // find best threshold for smaller child
-   smaller_leaf_histogram_array_[feature_index].FindBestThreshold(&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
-   // only has root leaf
-   if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
+   if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
-   if (parent_leaf_histogram_array_ != nullptr) {
-     // construct histgroms for large leaf, we initialize larger leaf as the parent,
-     // so we can just subtract the smaller leaf's histograms
+   if (use_subtract) {
      larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
    } else {
-     if (ordered_bins_[feature_index] == nullptr) {
-       // if not use ordered bin
-       larger_leaf_histogram_array_[feature_index].Construct(larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), ptr_to_ordered_gradients_larger_leaf_, ptr_to_ordered_hessians_larger_leaf_);
-     } else {
-       // used ordered bin
-       larger_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index].get(), larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), gradients_, hessians_);
-     }
+     train_data_->FixHistogram(feature_index,
+         larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
+         larger_leaf_splits_->num_data_in_leaf(),
+         larger_leaf_histogram_array_[feature_index].RawData());
    }
+   SplitInfo larger_split;
    // find best threshold for larger child
-   larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
+   larger_leaf_histogram_array_[feature_index].FindBestThreshold(
+       larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
+       larger_leaf_splits_->num_data_in_leaf(), &larger_split);
+   if (larger_split.gain > larger_best[tid].gain) {
+     larger_best[tid] = larger_split;
+     larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+   }
  }
+ auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
+ int leaf = smaller_leaf_splits_->LeafIndex();
+ best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
+ if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
+   leaf = larger_leaf_splits_->LeafIndex();
+   auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
+   best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
+ }
+ #ifdef TIMETAG
+ find_split_time += std::chrono::steady_clock::now() - start_time;
+ #endif
}

- void SerialTreeLearner::FindBestSplitsForLeaves() {
- }
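The rewritten FindBestThresholds also changes how the global best split is reduced: each OpenMP thread tracks its own running best in smaller_best[tid] / larger_best[tid], and a final ArgMax over the per-thread slots picks the winner with no locking. A minimal sketch of that pattern with synthetic gains (compile with -fopenmp):

    #include <cstdio>
    #include <omp.h>
    #include <vector>

    int main() {
      const int n = 1000;
      std::vector<double> best(omp_get_max_threads(), -1e30);
      std::vector<int> best_idx(omp_get_max_threads(), -1);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < n; ++i) {
        double gain = (i * 37 % 101) * 0.1;  // synthetic per-feature gain
        int tid = omp_get_thread_num();
        if (gain > best[tid]) { best[tid] = gain; best_idx[tid] = i; }
      }
      // lock-free reduction: ArgMax over the per-thread bests
      int winner = 0;
      for (int t = 1; t < static_cast<int>(best.size()); ++t)
        if (best[t] > best[winner]) winner = t;
      std::printf("best gain %.1f at index %d\n", best[winner], best_idx[winner]);
      return 0;
    }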
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
+ const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
  // left = parent
  *left_leaf = best_Leaf;
  // split tree, will return right leaf
- *right_leaf = tree->Split(best_Leaf, best_split_info.feature,
-                           train_data_->FeatureAt(best_split_info.feature)->bin_type(),
-                           best_split_info.threshold,
-                           train_data_->FeatureAt(best_split_info.feature)->feature_index(),
-                           train_data_->FeatureAt(best_split_info.feature)->BinToValue(best_split_info.threshold),
-                           static_cast<double>(best_split_info.left_output),
-                           static_cast<double>(best_split_info.right_output),
-                           static_cast<data_size_t>(best_split_info.left_count),
-                           static_cast<data_size_t>(best_split_info.right_count),
-                           static_cast<double>(best_split_info.gain));
+ *right_leaf = tree->Split(best_Leaf, inner_feature_index,
+                           train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
+                           best_split_info.threshold,
+                           best_split_info.feature,
+                           train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
+                           static_cast<double>(best_split_info.left_output),
+                           static_cast<double>(best_split_info.right_output),
+                           static_cast<data_size_t>(best_split_info.left_count),
+                           static_cast<data_size_t>(best_split_info.right_count),
+                           static_cast<double>(best_split_info.gain));
  // split data partition
- data_partition_->Split(best_Leaf, train_data_->FeatureAt(best_split_info.feature)->bin_data(),
+ data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
                         best_split_info.threshold, *right_leaf);
  // init the leaves that used on next iteration
...
...
@@ -431,8 +515,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
      best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
      best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
  } else {
    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
      best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
      best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
...
...