Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
12ce2566
Commit
12ce2566
authored
Mar 13, 2017
by
Guolin Ke
Browse files
fix max_bin of categorical feature in parallel learning.
parent
ebc0de8b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
32 additions
and
16 deletions
+32
-16
include/LightGBM/boosting.h
include/LightGBM/boosting.h
+3
-3
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+1
-1
src/boosting/gbdt.h
src/boosting/gbdt.h
+1
-1
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+27
-11
No files found.
include/LightGBM/boosting.h
View file @
12ce2566
...
...
@@ -99,14 +99,14 @@ public:
/*!
* \brief Get the number of prediction results at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \return out_len leng
h
t of returned score
* \return out_len lengt
h
of returned score
*/
virtual
int64_t
GetNumPredictAt
(
int
data_idx
)
const
=
0
;
/*!
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store the prediction result; memory should be allocated before calling this function
* \param out_len leng
h
t of returned score
* \param out_len lengt
h
of returned score
*/
virtual
void
GetPredictAt
(
int
data_idx
,
double
*
result
,
int64_t
*
out_len
)
=
0
;
...
...
@@ -125,7 +125,7 @@ public:
virtual
std
::
vector
<
double
>
Predict
(
const
double
*
feature_values
)
const
=
0
;
/*!
* \brief Predtion for one record with leaf index
* \brief Pred
ic
tion for one record with leaf index
* \param feature_values Feature value on this record
* \return Predicted leaf index for this record
*/
...
...
include/LightGBM/c_api.h
View file @
12ce2566
...
...
@@ -540,7 +540,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len leng
h
t of prediction
* \param out_len lengt
h
of prediction
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT
int
LGBM_BoosterCalcNumPredict
(
BoosterHandle
handle
,
...
...
src/boosting/gbdt.h
View file @
12ce2566
...
...
@@ -119,7 +119,7 @@ public:
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store the prediction result; memory should be allocated before calling this function
* \param out_len leng
h
t of returned score
* \param out_len lengt
h
of returned score
*/
void
GetPredictAt
(
int
data_idx
,
double
*
out_result
,
int64_t
*
out_len
)
override
;
...
...
src/io/dataset_loader.cpp
View file @
12ce2566
...
...
@@ -721,11 +721,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
}
else
{
// if have multi-machines, need find bin distributed
// if have multi-machines, need
to
find bin distributed
// different machines will find bins for different features
// start and len store the range of feature indices processed by each machine
// machine i will find bins for features in [ st
r
at[i], start[i] + len[i] )
// machine i will find bins for features in [ sta
r
t[i], start[i] + len[i] )
std
::
vector
<
int
>
start
(
num_machines
);
std
::
vector
<
int
>
len
(
num_machines
);
int
total_num_feature
=
static_cast
<
int
>
(
sample_values
.
size
());
...
...
@@ -738,8 +738,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
}
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
// get max_bin
int
max_bin
=
0
;
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
max_bin
=
std
::
max
(
max_bin
,
bin_mappers
[
i
]
->
num_bin
());
}
// get size of a bin mapper that holds max_bin bins
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
io_config_
.
max_bin
);
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
max_bin
);
// since the sizes of different features may not be the same, we expand all bin mappers to type_size
int
buffer_size
=
type_size
*
total_num_feature
;
auto
input_buffer
=
std
::
vector
<
char
>
(
buffer_size
);
...
...
@@ -751,14 +772,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if
(
ignore_features_
.
count
(
start
[
rank
]
+
i
)
>
0
)
{
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
BinMapper
bin_mapper
;
bin_mapper
.
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
bin_mapper
.
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
bin_mappers
[
i
]
->
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
// free
bin_mappers
[
i
].
reset
(
nullptr
);
}
// convert to binary size
for
(
int
i
=
0
;
i
<
num_machines
;
++
i
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment