Commit 12ce2566 authored by Guolin Ke's avatar Guolin Ke
Browse files

Fix max_bin of categorical features in parallel learning.

parent ebc0de8b
...@@ -99,14 +99,14 @@ public: ...@@ -99,14 +99,14 @@ public:
/*! /*!
* \brief Get prediction result at data_idx data * \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data * \param data_idx 0: training data, 1: 1st validation data
* \return out_len lenght of returned score * \return out_len length of returned score
*/ */
virtual int64_t GetNumPredictAt(int data_idx) const = 0; virtual int64_t GetNumPredictAt(int data_idx) const = 0;
/*! /*!
* \brief Get prediction result at data_idx data * \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data * \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len length of returned score
*/ */
virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0; virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;
...@@ -125,7 +125,7 @@ public: ...@@ -125,7 +125,7 @@ public:
virtual std::vector<double> Predict(const double* feature_values) const = 0; virtual std::vector<double> Predict(const double* feature_values) const = 0;
/*! /*!
* \brief Predtion for one record with leaf index * \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \return Predicted leaf index for this record * \return Predicted leaf index for this record
*/ */
......
...@@ -540,7 +540,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle, ...@@ -540,7 +540,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
* C_API_PREDICT_RAW_SCORE: raw score * C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index * C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit * \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len lenght of prediction * \param out_len length of prediction
* \return 0 when succeed, -1 when failure happens * \return 0 when succeed, -1 when failure happens
*/ */
LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle, LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
......
...@@ -119,7 +119,7 @@ public: ...@@ -119,7 +119,7 @@ public:
* \brief Get prediction result at data_idx data * \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data * \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len length of returned score
*/ */
void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override; void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
......
...@@ -721,11 +721,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -721,11 +721,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type); io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
} }
} else { } else {
// if have multi-machines, need find bin distributed // if have multi-machines, need to find bin distributed
// different machines will find bin for different features // different machines will find bin for different features
// start and len will store the process feature indices for different machines // start and len will store the process feature indices for different machines
// machine i will find bins for features in [ strat[i], start[i] + len[i] ) // machine i will find bins for features in [ start[i], start[i] + len[i] )
std::vector<int> start(num_machines); std::vector<int> start(num_machines);
std::vector<int> len(num_machines); std::vector<int> len(num_machines);
int total_num_feature = static_cast<int>(sample_values.size()); int total_num_feature = static_cast<int>(sample_values.size());
...@@ -738,8 +738,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -738,8 +738,29 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start[i + 1] = start[i] + len[i]; start[i + 1] = start[i] + len[i];
} }
len[num_machines - 1] = total_num_feature - start[num_machines - 1]; len[num_machines - 1] = total_num_feature - start[num_machines - 1];
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
BinType bin_type = BinType::NumericalBin;
if (categorical_features_.count(start[rank] + i)) {
bin_type = BinType::CategoricalBin;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
}
// get max_bin
int max_bin = 0;
for (int i = 0; i < len[rank]; ++i) {
if (ignore_features_.count(start[rank] + i) > 0) {
continue;
}
max_bin = std::max(max_bin, bin_mappers[i]->num_bin());
}
// get size of bin mapper with max_bin_ size // get size of bin mapper with max_bin_ size
int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin); int type_size = BinMapper::SizeForSpecificBin(max_bin);
// since sizes of different feature may not be same, we expand all bin mapper to type_size // since sizes of different feature may not be same, we expand all bin mapper to type_size
int buffer_size = type_size * total_num_feature; int buffer_size = type_size * total_num_feature;
auto input_buffer = std::vector<char>(buffer_size); auto input_buffer = std::vector<char>(buffer_size);
...@@ -751,14 +772,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -751,14 +772,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
if (ignore_features_.count(start[rank] + i) > 0) { if (ignore_features_.count(start[rank] + i) > 0) {
continue; continue;
} }
BinType bin_type = BinType::NumericalBin; bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
if (categorical_features_.count(start[rank] + i)) { // free
bin_type = BinType::CategoricalBin; bin_mappers[i].reset(nullptr);
}
BinMapper bin_mapper;
bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt, bin_type);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
} }
// convert to binary size // convert to binary size
for (int i = 0; i < num_machines; ++i) { for (int i = 0; i < num_machines; ++i) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment