Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
cc11525d
Commit
cc11525d
authored
Sep 28, 2017
by
ChenZhiyong
Committed by
Guolin Ke
Sep 28, 2017
Browse files
refine categorical split (#919)
* refine categorical split * add test
parent
b1b24ee2
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
141 additions
and
52 deletions
+141
-52
include/LightGBM/config.h
include/LightGBM/config.h
+7
-2
src/io/config.cpp
src/io/config.cpp
+11
-0
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+87
-44
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+36
-6
No files found.
include/LightGBM/config.h
View file @
cc11525d
...
@@ -224,7 +224,12 @@ public:
...
@@ -224,7 +224,12 @@ public:
int
gpu_device_id
=
-
1
;
int
gpu_device_id
=
-
1
;
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
bool
gpu_use_dp
=
false
;
bool
gpu_use_dp
=
false
;
int
max_cat_group
=
64
;
int
min_data_per_group
=
10
;
int
max_cat_threshold
=
256
;
int
max_cat_threshold
=
256
;
double
cat_smooth_ratio
=
0.01
;
double
min_cat_smooth
=
5
;
double
max_cat_smooth
=
100
;
LIGHTGBM_EXPORT
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
override
;
LIGHTGBM_EXPORT
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
override
;
};
};
...
@@ -463,8 +468,8 @@ struct ParameterAlias {
...
@@ -463,8 +468,8 @@ struct ParameterAlias {
"snapshot_freq"
,
"verbosity"
,
"sparse_threshold"
,
"enable_load_from_binary_file"
,
"snapshot_freq"
,
"verbosity"
,
"sparse_threshold"
,
"enable_load_from_binary_file"
,
"max_conflict_rate"
,
"poisson_max_delta_step"
,
"gaussian_eta"
,
"max_conflict_rate"
,
"poisson_max_delta_step"
,
"gaussian_eta"
,
"histogram_pool_size"
,
"output_freq"
,
"is_provide_training_metric"
,
"machine_list_filename"
,
"histogram_pool_size"
,
"output_freq"
,
"is_provide_training_metric"
,
"machine_list_filename"
,
"zero_as_missing"
,
"
max_cat_threshold
"
,
"zero_as_missing"
,
"
init_score_file"
,
"valid_init_score_file"
,
"is_predict_contrib
"
,
"
init_score_file"
,
"valid_init_score_file"
,
"is_predict_contrib
"
"
max_cat_threshold"
,
"max_cat_group"
,
"cat_smooth_ratio"
,
"min_cat_smooth"
,
"max_cat_smooth"
,
"min_data_per_group
"
});
});
std
::
unordered_map
<
std
::
string
,
std
::
string
>
tmp_map
;
std
::
unordered_map
<
std
::
string
,
std
::
string
>
tmp_map
;
for
(
const
auto
&
pair
:
*
params
)
{
for
(
const
auto
&
pair
:
*
params
)
{
...
...
src/io/config.cpp
View file @
cc11525d
...
@@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
...
@@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetInt
(
params
,
"gpu_platform_id"
,
&
gpu_platform_id
);
GetInt
(
params
,
"gpu_platform_id"
,
&
gpu_platform_id
);
GetInt
(
params
,
"gpu_device_id"
,
&
gpu_device_id
);
GetInt
(
params
,
"gpu_device_id"
,
&
gpu_device_id
);
GetBool
(
params
,
"gpu_use_dp"
,
&
gpu_use_dp
);
GetBool
(
params
,
"gpu_use_dp"
,
&
gpu_use_dp
);
GetInt
(
params
,
"max_cat_group"
,
&
max_cat_group
);
GetInt
(
params
,
"max_cat_threshold"
,
&
max_cat_threshold
);
GetInt
(
params
,
"max_cat_threshold"
,
&
max_cat_threshold
);
GetDouble
(
params
,
"cat_smooth_ratio"
,
&
cat_smooth_ratio
);
GetDouble
(
params
,
"min_cat_smooth"
,
&
min_cat_smooth
);
GetDouble
(
params
,
"max_cat_smooth"
,
&
max_cat_smooth
);
GetInt
(
params
,
"min_data_per_group"
,
&
min_data_per_group
);
CHECK
(
max_cat_group
>
1
);
CHECK
(
max_cat_threshold
>
0
);
CHECK
(
cat_smooth_ratio
>=
0
);
CHECK
(
min_cat_smooth
>=
1
);
CHECK
(
max_cat_smooth
>
min_cat_smooth
);
CHECK
(
min_data_per_group
>
0
);
}
}
void
BoostingConfig
::
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
{
void
BoostingConfig
::
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
{
...
...
src/treelearner/feature_histogram.hpp
View file @
cc11525d
...
@@ -106,58 +106,90 @@ public:
...
@@ -106,58 +106,90 @@ public:
output
->
default_left
=
false
;
output
->
default_left
=
false
;
double
best_gain
=
kMinScore
;
double
best_gain
=
kMinScore
;
data_size_t
best_left_count
=
0
;
data_size_t
best_left_count
=
0
;
double
best_sum_left_gradient
=
0.0
f
;
double
best_sum_left_gradient
=
0
;
double
best_sum_left_hessian
=
0.0
f
;
double
best_sum_left_hessian
=
0
;
double
gain_shift
=
GetLeafSplitGain
(
sum_gradient
,
sum_hessian
,
double
gain_shift
=
GetLeafSplitGain
(
sum_gradient
,
sum_hessian
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
double
min_gain_shift
=
gain_shift
+
meta_
->
tree_config
->
min_gain_to_split
;
double
min_gain_shift
=
gain_shift
+
meta_
->
tree_config
->
min_gain_to_split
;
bool
is_full_categorical
=
meta_
->
missing_type
==
MissingType
::
None
;
int
used_bin
=
meta_
->
num_bin
-
1
;
if
(
is_full_categorical
)
++
used_bin
;
std
::
vector
<
int
>
sorted_idx
(
used_bin
);
for
(
int
i
=
0
;
i
<
used_bin
;
++
i
)
sorted_idx
[
i
]
=
i
;
double
smooth_hess
=
meta_
->
tree_config
->
cat_smooth_ratio
*
num_data
;
smooth_hess
=
std
::
min
(
meta_
->
tree_config
->
max_cat_smooth
,
std
::
max
(
smooth_hess
,
meta_
->
tree_config
->
min_cat_smooth
));
const
double
smooth_grad
=
smooth_hess
*
sum_gradient
/
sum_hessian
;
auto
ctr_fun
=
[
&
smooth_hess
,
&
smooth_grad
](
double
sum_grad
,
double
sum_hess
)
{
return
(
sum_grad
+
smooth_grad
)
/
(
sum_hess
+
smooth_hess
);
};
std
::
sort
(
sorted_idx
.
begin
(),
sorted_idx
.
end
(),
[
this
,
&
ctr_fun
](
int
i
,
int
j
)
{
return
ctr_fun
(
data_
[
i
].
sum_gradients
,
data_
[
i
].
sum_hessians
)
<
ctr_fun
(
data_
[
j
].
sum_gradients
,
data_
[
j
].
sum_hessians
);
});
std
::
vector
<
int
>
find_direction
(
1
,
1
);
std
::
vector
<
int
>
start_position
(
1
,
0
);
if
(
!
is_full_categorical
||
meta_
->
tree_config
->
max_cat_threshold
*
2
<
meta_
->
num_bin
)
{
find_direction
.
push_back
(
-
1
);
start_position
.
push_back
(
used_bin
-
1
);
}
is_splittable_
=
false
;
is_splittable_
=
false
;
int
best_threshold
=
-
1
;
int
best_dir
=
1
;
for
(
size_t
out_i
=
0
;
out_i
<
find_direction
.
size
();
++
out_i
)
{
auto
dir
=
find_direction
[
out_i
];
auto
start_pos
=
start_position
[
out_i
];
data_size_t
rest_group
=
meta_
->
tree_config
->
max_cat_group
;
data_size_t
min_data_per_group
=
std
::
max
(
meta_
->
tree_config
->
min_data_per_group
,
num_data
/
rest_group
);
data_size_t
cnt_cur_group
=
0
;
double
sum_left_gradient
=
0.0
f
;
double
sum_left_hessian
=
kEpsilon
;
data_size_t
left_count
=
0
;
for
(
int
i
=
0
;
i
<
used_bin
&&
i
<
meta_
->
tree_config
->
max_cat_threshold
;
++
i
)
{
auto
t
=
sorted_idx
[
start_pos
];
start_pos
+=
dir
;
uint32_t
best_threshold
=
0
;
sum_left_gradient
+=
data_
[
t
].
sum_gradients
;
bool
is_full_categorical
=
meta_
->
missing_type
==
MissingType
::
None
;
sum_left_hessian
+=
data_
[
t
].
sum_hessians
;
left_count
+=
data_
[
t
].
cnt
;
cnt_cur_group
+=
data_
[
t
].
cnt
;
int
used_bin
=
meta_
->
num_bin
-
1
+
is_full_categorical
;
if
(
left_count
<
meta_
->
tree_config
->
min_data_in_leaf
||
sum_left_hessian
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
continue
;
data_size_t
right_count
=
num_data
-
left_count
;
if
(
right_count
<
meta_
->
tree_config
->
min_data_in_leaf
||
right_count
<
min_data_per_group
)
break
;
// from right to left, and we don't need data in bin0
double
sum_right_hessian
=
sum_hessian
-
sum_left_hessian
;
for
(
int
t
=
0
;
t
<
used_bin
;
++
t
)
{
if
(
sum_right_hessian
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
break
;
// if data not enough, or sum hessian too small
if
(
data_
[
t
].
cnt
<
meta_
->
tree_config
->
min_data_in_leaf
||
data_
[
t
].
sum_hessians
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
continue
;
data_size_t
other_count
=
num_data
-
data_
[
t
].
cnt
;
// if data not enough
if
(
other_count
<
meta_
->
tree_config
->
min_data_in_leaf
)
continue
;
double
sum_other_hessian
=
sum_hessian
-
data_
[
t
].
sum_hessians
-
kEpsilon
;
if
(
cnt_cur_group
<
min_data_per_group
)
continue
;
// if sum hessian too small
if
(
sum_other_hessian
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
continue
;
double
sum_other_gradient
=
sum_gradient
-
data_
[
t
].
sum_gradients
;
cnt_cur_group
=
0
;
// current split gain
if
(
--
rest_group
>
0
)
min_data_per_group
=
std
::
max
(
meta_
->
tree_config
->
min_data_per_group
,
right_count
/
rest_group
);
double
current_gain
=
GetLeafSplitGain
(
sum_other_gradient
,
sum_other_hessian
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
)
+
GetLeafSplitGain
(
data_
[
t
].
sum_gradients
,
data_
[
t
].
sum_hessians
+
kEpsilon
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
// gain with split is worse than without split
if
(
current_gain
<=
min_gain_shift
)
continue
;
// mark to is splittable
double
sum_right_gradient
=
sum_gradient
-
sum_left_gradient
;
double
current_gain
=
GetLeafSplitGain
(
sum_left_gradient
,
sum_left_hessian
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
)
+
GetLeafSplitGain
(
sum_right_gradient
,
sum_right_hessian
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
if
(
current_gain
<=
min_gain_shift
)
continue
;
is_splittable_
=
true
;
is_splittable_
=
true
;
// better split point
if
(
current_gain
>
best_gain
)
{
if
(
current_gain
>
best_gain
)
{
best_
threshold
=
static_cast
<
uint32_t
>
(
t
)
;
best_
left_count
=
left_count
;
best_sum_left_gradient
=
data_
[
t
].
sum
_gradient
s
;
best_sum_left_gradient
=
sum_left
_gradient
;
best_sum_left_hessian
=
data_
[
t
].
sum_hessians
+
kEpsilo
n
;
best_sum_left_hessian
=
sum_left_hessia
n
;
best_
left_count
=
data_
[
t
].
cnt
;
best_
threshold
=
i
;
best_gain
=
current_gain
;
best_gain
=
current_gain
;
best_dir
=
dir
;
}
}
}
}
}
if
(
is_splittable_
)
{
if
(
is_splittable_
)
{
// update split information
output
->
num_cat_threshold
=
1
;
output
->
cat_threshold
.
resize
(
output
->
num_cat_threshold
);
output
->
cat_threshold
[
0
]
=
best_threshold
;
output
->
left_output
=
CalculateSplittedLeafOutput
(
best_sum_left_gradient
,
best_sum_left_hessian
,
output
->
left_output
=
CalculateSplittedLeafOutput
(
best_sum_left_gradient
,
best_sum_left_hessian
,
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
meta_
->
tree_config
->
lambda_l1
,
meta_
->
tree_config
->
lambda_l2
);
output
->
left_count
=
best_left_count
;
output
->
left_count
=
best_left_count
;
...
@@ -170,6 +202,17 @@ public:
...
@@ -170,6 +202,17 @@ public:
output
->
right_sum_gradient
=
sum_gradient
-
best_sum_left_gradient
;
output
->
right_sum_gradient
=
sum_gradient
-
best_sum_left_gradient
;
output
->
right_sum_hessian
=
sum_hessian
-
best_sum_left_hessian
-
kEpsilon
;
output
->
right_sum_hessian
=
sum_hessian
-
best_sum_left_hessian
-
kEpsilon
;
output
->
gain
=
best_gain
-
min_gain_shift
;
output
->
gain
=
best_gain
-
min_gain_shift
;
output
->
num_cat_threshold
=
best_threshold
+
1
;
output
->
cat_threshold
=
std
::
vector
<
uint32_t
>
(
output
->
num_cat_threshold
);
if
(
best_dir
==
1
)
{
for
(
int
i
=
0
;
i
<
output
->
num_cat_threshold
;
++
i
)
{
output
->
cat_threshold
[
i
]
=
sorted_idx
[
i
];
}
}
else
{
for
(
int
i
=
0
;
i
<
output
->
num_cat_threshold
;
++
i
)
{
output
->
cat_threshold
[
i
]
=
sorted_idx
[
used_bin
-
1
-
i
];
}
}
}
}
}
}
...
@@ -287,7 +330,7 @@ private:
...
@@ -287,7 +330,7 @@ private:
best_gain
=
current_gain
;
best_gain
=
current_gain
;
}
}
}
}
}
else
{
}
else
{
double
sum_left_gradient
=
0.0
f
;
double
sum_left_gradient
=
0.0
f
;
double
sum_left_hessian
=
kEpsilon
;
double
sum_left_hessian
=
kEpsilon
;
data_size_t
left_count
=
0
;
data_size_t
left_count
=
0
;
...
...
tests/python_package_test/test_engine.py
View file @
cc11525d
...
@@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase):
...
@@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase):
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
params
=
{
params
=
{
'objective'
:
'
binary
'
,
'objective'
:
'
regression
'
,
'metric'
:
'auc'
,
'metric'
:
'auc'
,
'verbose'
:
-
1
,
'verbose'
:
-
1
,
'boost_from_average'
:
False
,
'boost_from_average'
:
False
,
...
@@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase):
...
@@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase):
verbose_eval
=
True
,
verbose_eval
=
True
,
evals_result
=
evals_result
)
evals_result
=
evals_result
)
pred
=
gbm
.
predict
(
X_train
)
pred
=
gbm
.
predict
(
X_train
)
self
.
assert
A
lmost
E
qual
(
pred
[
-
1
],
pred
[
0
],
places
=
5
)
np
.
testing
.
assert
_a
lmost
_e
qual
(
pred
,
y
)
def
test_missing_value_handle_zero
(
self
):
def
test_missing_value_handle_zero
(
self
):
x
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
np
.
nan
]
x
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
np
.
nan
]
...
@@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase):
...
@@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase):
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
params
=
{
params
=
{
'objective'
:
'
binary
'
,
'objective'
:
'
regression
'
,
'metric'
:
'auc'
,
'metric'
:
'auc'
,
'verbose'
:
-
1
,
'verbose'
:
-
1
,
'boost_from_average'
:
False
,
'boost_from_average'
:
False
,
...
@@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase):
...
@@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase):
verbose_eval
=
True
,
verbose_eval
=
True
,
evals_result
=
evals_result
)
evals_result
=
evals_result
)
pred
=
gbm
.
predict
(
X_train
)
pred
=
gbm
.
predict
(
X_train
)
self
.
assertAlmostEqual
(
pred
[
-
1
],
pred
[
-
2
],
places
=
5
)
np
.
testing
.
assert_almost_equal
(
pred
,
y
)
self
.
assertAlmostEqual
(
pred
[
-
1
],
pred
[
0
],
places
=
5
)
def
test_missing_value_handle_none
(
self
):
def
test_missing_value_handle_none
(
self
):
x
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
np
.
nan
]
x
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
np
.
nan
]
...
@@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase):
...
@@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase):
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
params
=
{
params
=
{
'objective'
:
'
binary
'
,
'objective'
:
'
regression
'
,
'metric'
:
'auc'
,
'metric'
:
'auc'
,
'verbose'
:
-
1
,
'verbose'
:
-
1
,
'boost_from_average'
:
False
,
'boost_from_average'
:
False
,
...
@@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase):
...
@@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase):
self
.
assertAlmostEqual
(
pred
[
0
],
pred
[
1
],
places
=
5
)
self
.
assertAlmostEqual
(
pred
[
0
],
pred
[
1
],
places
=
5
)
self
.
assertAlmostEqual
(
pred
[
-
1
],
pred
[
0
],
places
=
5
)
self
.
assertAlmostEqual
(
pred
[
-
1
],
pred
[
0
],
places
=
5
)
def
test_categorical_handle
(
self
):
x
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
]
y
=
[
0
,
1
,
0
,
1
,
0
,
1
,
0
,
1
]
X_train
=
np
.
array
(
x
).
reshape
(
len
(
x
),
1
)
y_train
=
np
.
array
(
y
)
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_train
,
y_train
)
params
=
{
'objective'
:
'regression'
,
'metric'
:
'auc'
,
'verbose'
:
-
1
,
'boost_from_average'
:
False
,
'min_data'
:
1
,
'num_leaves'
:
2
,
'learning_rate'
:
1
,
'min_data_in_bin'
:
1
,
'min_data_per_group'
:
1
,
'zero_as_missing'
:
True
,
'categorical_column'
:
0
}
evals_result
=
{}
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
1
,
valid_sets
=
lgb_eval
,
verbose_eval
=
True
,
evals_result
=
evals_result
)
pred
=
gbm
.
predict
(
X_train
)
np
.
testing
.
assert_almost_equal
(
pred
,
y
)
def
test_multiclass
(
self
):
def
test_multiclass
(
self
):
X
,
y
=
load_digits
(
10
,
True
)
X
,
y
=
load_digits
(
10
,
True
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment