Commit 12ce2566 authored by Guolin Ke
Browse files

fix max_bin of categorical feature in parallel learning.

parent ebc0de8b
......@@ -99,14 +99,14 @@ public:
/*!
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \return out_len lenght of returned score
* \return out_len length of returned score
*/
virtual int64_t GetNumPredictAt(int data_idx) const = 0;
/*!
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result; memory should be allocated before calling this function
* \param out_len lenght of returned score
* \param out_len length of returned score
*/
virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;
......@@ -125,7 +125,7 @@ public:
virtual std::vector<double> Predict(const double* feature_values) const = 0;
/*!
* \brief Predtion for one record with leaf index
* \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record
* \return Predicted leaf index for this record
*/
......
......@@ -540,7 +540,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len lenght of prediction
* \param out_len length of prediction
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
......
......@@ -119,7 +119,7 @@ public:
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result; memory should be allocated before calling this function
* \param out_len lenght of returned score
* \param out_len length of returned score
*/
void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
......
......@@ -721,11 +721,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
} else {
// if have multi-machines, need find bin distributed
// if have multi-machines, need to find bin distributed
// different machines will find bin for different features
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ strat[i], start[i] + len[i] )
// machine i will find bins for features in [ start[i], start[i] + len[i] )
std::vector<int> start(num_machines);
std::vector<int> len(num_machines);
int total_num_feature = static_cast<int>(sample_values.size());
......@@ -738,8 +738,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start[i + 1] = start[i] + len[i];
}
len[num_machines - 1] = total_num_feature - start[num_machines - 1];
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
// get max_bin
int max_bin = 0;
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
max_bin = std::max(max_bin, bin_mappers[i]->num_bin());
}
// get size of bin mapper with max_bin_ size
int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
int type_size = BinMapper::SizeForSpecificBin(max_bin);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int buffer_size = type_size * total_num_feature;
auto input_buffer = std::vector<char>(buffer_size);
......@@ -751,14 +772,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
BinMapper bin_mapper;
bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
// free
bin_mappers[i].reset(nullptr);
}
// convert to binary size
for (int i = 0; i < num_machines; ++i) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment