Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
db9ec217
Commit
db9ec217
authored
Oct 17, 2017
by
Guolin Ke
Browse files
reduce parameters in categorical split
parent
8dc2bed8
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
22 additions
and
43 deletions
+22
-43
docs/Advanced-Topics.rst
docs/Advanced-Topics.rst
+1
-1
docs/Parameters.rst
docs/Parameters.rst
+4
-18
include/LightGBM/config.h
include/LightGBM/config.h
+5
-7
src/io/bin.cpp
src/io/bin.cpp
+3
-0
src/io/config.cpp
src/io/config.cpp
+2
-6
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+4
-8
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+3
-3
No files found.
docs/Advanced-Topics.rst
View file @
db9ec217
...
...
@@ -24,7 +24,7 @@ Categorical Feature Support
- Converting to ``int`` type is needed first, and there is support for non-negative numbers only.
It is better to convert into continues ranges.
- Use ``min_data_per_group``, ``cat_smooth
_ratio
`` to deal with over-fitting
- Use ``min_data_per_group``, ``cat_smooth`` to deal with over-fitting
(when ``#data`` is small or ``#category`` is large).
- For categorical features with high cardinality (``#category`` is large), it is better to convert it to numerical features.
...
...
docs/Parameters.rst
View file @
db9ec217
...
...
@@ -253,33 +253,19 @@ Learning Control Parameters
- min number of data per categorical group
- ``max_cat_threshold``, default=\ ``
128
``, type=int
- ``max_cat_threshold``, default=\ ``
32
``, type=int
- use for the categorical features
- limit the max threshold points in categorical features
- ``min_cat_smooth``, default=\ ``5``, type=double
- use for the categorical features
- refer to the descrption of the paramater ``cat_smooth_ratio``
- ``max_cat_smooth``, default=\ ``50``, type=double
- use for the categorical features
- refer to the descrption of the paramater ``cat_smooth_ratio``
- ``cat_smooth_ratio``, default=\ ``0.01``, type=double
- ``cat_smooth``, default=\ ``10``, type=double
- use for the categorical features
- this can reduce the effect of noises in categorical features, especially for categories with few data
- the smooth denominator is ``a = min(max_cat_smooth, max(min_cat_smooth, num_data / num_category * cat_smooth_ratio))``
- ``cat_l2``, default=\ ``1``, type=double
- ``cat_l2``, default=\ ``10``, type=double
- L2 regularization in categorcial split
...
...
@@ -294,7 +280,7 @@ IO Parameters
- LightGBM will auto compress memory according ``max_bin``.
For example, LightGBM will use ``uint8_t`` for feature value if ``max_bin=255``
- ``min_data_in_bin``, default=\ ``
5
``, type=int
- ``min_data_in_bin``, default=\ ``
3
``, type=int
- min number of data inside one bin, use this to avoid one-data-one-bin (may over-fitting)
...
...
include/LightGBM/config.h
View file @
db9ec217
...
...
@@ -122,7 +122,7 @@ public:
bool
is_predict_contrib
=
false
;
bool
is_predict_raw_score
=
false
;
int
min_data_in_leaf
=
20
;
int
min_data_in_bin
=
5
;
int
min_data_in_bin
=
3
;
double
max_conflict_rate
=
0.0
f
;
bool
enable_bundle
=
true
;
bool
has_header
=
false
;
...
...
@@ -226,11 +226,9 @@ public:
/*! \brief Set to true to use double precision math on GPU (default using single precision) */
bool
gpu_use_dp
=
false
;
int
min_data_per_group
=
100
;
int
max_cat_threshold
=
128
;
double
cat_smooth_ratio
=
0.01
;
double
cat_l2
=
1
;
double
min_cat_smooth
=
5
;
double
max_cat_smooth
=
50
;
int
max_cat_threshold
=
32
;
double
cat_l2
=
10
;
double
cat_smooth
=
10
;
LIGHTGBM_EXPORT
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
override
;
};
...
...
@@ -473,7 +471,7 @@ struct ParameterAlias {
"max_conflict_rate"
,
"poisson_max_delta_step"
,
"gaussian_eta"
,
"histogram_pool_size"
,
"output_freq"
,
"is_provide_training_metric"
,
"machine_list_filename"
,
"machines"
,
"zero_as_missing"
,
"init_score_file"
,
"valid_init_score_file"
,
"is_predict_contrib"
,
"max_cat_threshold"
,
"
cat_smooth_ratio"
,
"min_cat_smooth"
,
"max_
cat_smooth"
,
"min_data_per_group"
,
"cat_l2"
"max_cat_threshold"
,
"cat_smooth"
,
"min_data_per_group"
,
"cat_l2"
});
std
::
unordered_map
<
std
::
string
,
std
::
string
>
tmp_map
;
for
(
const
auto
&
pair
:
*
params
)
{
...
...
src/io/bin.cpp
View file @
db9ec217
...
...
@@ -329,6 +329,9 @@ namespace LightGBM {
cnt_in_bin
.
clear
();
while
(
cur_cat
<
distinct_values_int
.
size
()
&&
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
))
{
if
(
counts_int
[
cur_cat
]
<
min_data_in_bin
&&
cur_cat
>
1
)
{
break
;
}
bin_2_categorical_
.
push_back
(
distinct_values_int
[
cur_cat
]);
categorical_2_bin_
[
distinct_values_int
[
cur_cat
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
cur_cat
];
...
...
src/io/config.cpp
View file @
db9ec217
...
...
@@ -381,16 +381,12 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetInt
(
params
,
"gpu_device_id"
,
&
gpu_device_id
);
GetBool
(
params
,
"gpu_use_dp"
,
&
gpu_use_dp
);
GetInt
(
params
,
"max_cat_threshold"
,
&
max_cat_threshold
);
GetDouble
(
params
,
"cat_smooth_ratio"
,
&
cat_smooth_ratio
);
GetDouble
(
params
,
"cat_l2"
,
&
cat_l2
);
GetDouble
(
params
,
"min_cat_smooth"
,
&
min_cat_smooth
);
GetDouble
(
params
,
"max_cat_smooth"
,
&
max_cat_smooth
);
GetDouble
(
params
,
"cat_smooth"
,
&
cat_smooth
);
GetInt
(
params
,
"min_data_per_group"
,
&
min_data_per_group
);
CHECK
(
max_cat_threshold
>
0
);
CHECK
(
cat_smooth_ratio
>=
0
);
CHECK
(
cat_l2
>=
0.0
f
);
CHECK
(
min_cat_smooth
>=
1
);
CHECK
(
max_cat_smooth
>
min_cat_smooth
);
CHECK
(
cat_smooth
>=
1
);
CHECK
(
min_data_per_group
>
0
);
}
...
...
src/treelearner/feature_histogram.hpp
View file @
db9ec217
...
...
@@ -116,13 +116,9 @@ public:
if
(
is_full_categorical
)
++
used_bin
;
const
double
smooth_hess
=
std
::
max
(
meta_
->
tree_config
->
min_cat_smooth
,
std
::
min
(
meta_
->
tree_config
->
cat_smooth_ratio
*
num_data
,
meta_
->
tree_config
->
max_cat_smooth
));
const
int
min_data_per_cat
=
static_cast
<
int
>
(
smooth_hess
);
std
::
vector
<
int
>
sorted_idx
;
for
(
int
i
=
0
;
i
<
used_bin
;
++
i
)
{
if
(
data_
[
i
].
cnt
>=
m
in_data_per_cat
)
{
if
(
data_
[
i
].
cnt
>=
m
eta_
->
tree_config
->
cat_smooth
)
{
sorted_idx
.
push_back
(
i
);
}
}
...
...
@@ -130,12 +126,12 @@ public:
const
double
l2
=
meta_
->
tree_config
->
lambda_l2
+
meta_
->
tree_config
->
cat_l2
;
auto
ctr_fun
=
[
&
smooth_hes
s
](
double
sum_grad
,
double
sum_hess
)
{
return
(
sum_grad
)
/
(
sum_hess
+
smooth
_hess
);
auto
ctr_fun
=
[
thi
s
](
double
sum_grad
,
double
sum_hess
)
{
return
(
sum_grad
)
/
(
sum_hess
+
meta_
->
tree_config
->
cat_
smooth
);
};
std
::
sort
(
sorted_idx
.
begin
(),
sorted_idx
.
end
(),
[
this
,
&
ctr_fun
](
int
i
,
int
j
)
{
return
ctr_fun
(
data_
[
i
].
sum_gradients
,
data_
[
i
].
cnt
)
<
ctr_fun
(
data_
[
j
].
sum_gradients
,
data_
[
j
].
cnt
);
return
ctr_fun
(
data_
[
i
].
sum_gradients
,
data_
[
i
].
sum_hessians
)
<
ctr_fun
(
data_
[
j
].
sum_gradients
,
data_
[
j
].
sum_hessians
);
});
std
::
vector
<
int
>
find_direction
(
1
,
1
);
...
...
tests/python_package_test/test_engine.py
View file @
db9ec217
...
...
@@ -229,7 +229,7 @@ class TestEngine(unittest.TestCase):
'learning_rate'
:
1
,
'min_data_in_bin'
:
1
,
'min_data_per_group'
:
1
,
'
min_
cat_smooth'
:
1
,
'cat_smooth'
:
1
,
'cat_l2'
:
0
,
'zero_as_missing'
:
True
,
'categorical_column'
:
0
...
...
@@ -262,7 +262,7 @@ class TestEngine(unittest.TestCase):
'learning_rate'
:
1
,
'min_data_in_bin'
:
1
,
'min_data_per_group'
:
1
,
'
min_
cat_smooth'
:
1
,
'cat_smooth'
:
1
,
'cat_l2'
:
0
,
'zero_as_missing'
:
False
,
'categorical_column'
:
0
...
...
@@ -295,7 +295,7 @@ class TestEngine(unittest.TestCase):
'learning_rate'
:
1
,
'min_data_in_bin'
:
1
,
'min_data_per_group'
:
1
,
'
min_
cat_smooth'
:
1
,
'cat_smooth'
:
1
,
'cat_l2'
:
0
,
'zero_as_missing'
:
False
,
'categorical_column'
:
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment