Commit eade219e authored by Qiwei Ye's avatar Qiwei Ye
Browse files

merge conflict

parents f23e6083 060bd316
......@@ -6,7 +6,7 @@
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/feature.h>
#include <LightGBM/feature_group.h>
#include <vector>
#include <utility>
......@@ -19,7 +19,6 @@ namespace LightGBM {
/*! \brief forward declaration */
class DatasetLoader;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
* e.g. labels, weights, initial scores, query level information.
......@@ -88,8 +87,6 @@ public:
void SetQuery(const data_size_t* query, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
......@@ -175,7 +172,7 @@ public:
* \brief Get Number of queries
* \return Number of queries
*/
inline const data_size_t num_queries() const { return num_queries_; }
inline data_size_t num_queries() const { return num_queries_; }
/*!
* \brief Get weights for queries, if not exists, will return nullptr
......@@ -244,6 +241,9 @@ private:
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
bool weight_load_from_file_;
bool query_load_from_file_;
bool init_score_load_from_file_;
};
......@@ -280,14 +280,20 @@ class Dataset {
public:
friend DatasetLoader;
Dataset();
LIGHTGBM_EXPORT Dataset();
Dataset(data_size_t num_data);
LIGHTGBM_EXPORT Dataset(data_size_t num_data);
void Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_non_zero_indices,
size_t total_sample_cnt,
const IOConfig& io_config);
/*! \brief Destructor */
~Dataset();
LIGHTGBM_EXPORT ~Dataset();
bool CheckAlign(const Dataset& other) const {
LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
if (num_features_ != other.num_features_) {
return false;
}
......@@ -298,7 +304,7 @@ public:
return false;
}
for (int i = 0; i < num_features_; ++i) {
if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
return false;
}
}
......@@ -306,57 +312,140 @@ public:
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
if (is_finish_load_) { return; }
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, feature_values[i]);
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
}
}
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
if (is_finish_load_) { return; }
for (auto& inner_data : feature_values) {
if (inner_data.first >= num_total_features_) { continue; }
int feature_idx = used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, inner_data.second);
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
}
}
}
inline int GetInnerFeatureIndex(int col_idx) const {
/*! \brief Push one raw value into the bin data of the given group / sub-feature slot. */
inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
  auto& target_group = feature_groups_[group];
  target_group->PushData(tid, sub_feature, row_idx, value);
}
/*! \brief Map an inner (used) feature index to its real feature index
 *  (per the real_feature_idx_ table built at load time). */
inline int RealFeatureIndex(int fidx) const {
  const int real_idx = real_feature_idx_[fidx];
  return real_idx;
}
/*! \brief Inner feature index for a raw data column.
 *  A negative result (-1) marks a column that is not used by the dataset. */
inline int InnerFeatureIndex(int col_idx) const {
  const int inner_idx = used_feature_map_[col_idx];
  return inner_idx;
}
/*! \brief Index of the feature group that inner feature feature_idx belongs to. */
inline int Feature2Group(int feature_idx) const {
  const int group_idx = feature2group_[feature_idx];
  return group_idx;
}
/*! \brief Position of inner feature feature_idx inside its feature group.
 *  NOTE(review): the name is misspelled ("Feture"); it is kept as-is because
 *  it is part of the public interface and renaming would break callers. */
inline int Feture2SubFeature(int feature_idx) const {
  const int sub_idx = feature2subfeature_[feature_idx];
  return sub_idx;
}
/*! \brief Total number of bins over all feature groups
 *  (the last cumulative group bin boundary). */
inline uint64_t NumTotalBin() const {
  const uint64_t total = group_bin_boundaries_.back();
  return total;
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
void ReSize(data_size_t num_data);
void FinishLoad();
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
LIGHTGBM_EXPORT void FinishLoad();
bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
void SaveBinaryFile(const char* bin_filename);
LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;
/*!
 * \brief Partition rows by a bin threshold on one inner feature.
 *        Resolves the feature's group and in-group position, then delegates
 *        to FeatureGroup::Split on that group's bin data.
 * \param feature Inner feature index
 * \param threshold Split threshold, expressed in bin space
 * \param data_indices Row indices to partition
 * \param num_data Number of indices in data_indices
 * \param lte_indices Output buffer for the <= side
 * \param gt_indices Output buffer for the > side
 * \return Forwarded from FeatureGroup::Split (see Bin::Split for exact semantics)
 */
inline data_size_t Split(int feature, uint32_t threshold,
                         data_size_t* data_indices, data_size_t num_data,
                         data_size_t* lte_indices, data_size_t* gt_indices) const {
  return feature_groups_[feature2group_[feature]]->Split(
      feature2subfeature_[feature], threshold,
      data_indices, num_data, lte_indices, gt_indices);
}
/*!
 * \brief Offset of the first usable histogram bin for inner feature i.
 *        Bin 0 of every group is a reserved (default-bin) slot, so the first
 *        sub-feature of a group starts at offset 1; later ones start at 0.
 */
inline int SubFeatureBinOffset(int i) const {
  return (feature2subfeature_[i] == 0) ? 1 : 0;
}
void CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse);
/*! \brief Number of bins of inner feature i (queried from its bin mapper). */
inline int FeatureNumBin(int i) const {
  const int grp = feature2group_[i];
  return feature_groups_[grp]->bin_mappers_[feature2subfeature_[i]]->num_bin();
}
/*! \brief Non-owning pointer to the bin mapper of inner feature i. */
inline const BinMapper* FeatureBinMapper(int i) const {
  const auto& owner_group = feature_groups_[feature2group_[i]];
  return owner_group->bin_mappers_[feature2subfeature_[i]].get();
}
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
* \return Pointer of feature
*/
inline Feature* FeatureAt(int i) const { return features_[i].get(); }
/*! \brief Iterator over the bin data of inner feature i, restricted to that
 *  feature's bin range within its group. Caller takes ownership of the result. */
inline BinIterator* FeatureIterator(int i) const {
  const int grp = feature2group_[i];
  return feature_groups_[grp]->SubFeatureIterator(feature2subfeature_[i]);
}
inline double RealThreshold(int i, uint32_t threshold) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
}
/*!
 * \brief Create one OrderedBin per feature group, in parallel.
 * \param ordered_bins Output vector; resized to the number of groups, each
 *        entry reset to a fresh OrderedBin built from that group's bin data.
 */
inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
ordered_bins->resize(num_groups_);
// Safe to parallelize: each iteration writes only its own slot.
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
}
}
/*!
* \brief Get meta data pointer
......@@ -384,6 +473,20 @@ public:
feature_names_ = std::vector<std::string>(feature_names);
}
/*!
 * \brief Human-readable bin information for every original feature column.
 * \return One string per total (raw) feature column: "none" for columns that
 *         are not used (used_feature_map_ entry is -1), otherwise the bin
 *         mapper's info string for that feature.
 */
inline std::vector<std::string> feature_infos() const {
  std::vector<std::string> bufs;
  // One entry per raw column; reserving avoids repeated reallocations.
  bufs.reserve(num_total_features_);
  for (int i = 0; i < num_total_features_; ++i) {
    const int fidx = used_feature_map_[i];
    if (fidx == -1) {
      // Keep a placeholder so indices still line up with raw columns.
      bufs.push_back("none");
    } else {
      const auto bin_mapper = FeatureBinMapper(fidx);
      bufs.push_back(bin_mapper->bin_info());
    }
  }
  return bufs;
}
/*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; }
......@@ -395,7 +498,7 @@ public:
private:
const char* data_filename_;
/*! \brief Store used features */
std::vector<std::unique_ptr<Feature>> features_;
std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
/*! \brief Mapper from real feature index to used index*/
std::vector<int> used_feature_map_;
/*! \brief Number of used features*/
......@@ -412,6 +515,14 @@ private:
std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
int num_groups_;
std::vector<int> real_feature_idx_;
std::vector<int> feature2group_;
std::vector<int> feature2subfeature_;
std::vector<uint64_t> group_bin_boundaries_;
std::vector<int> group_feature_start_;
std::vector<int> group_feature_cnt_;
bool is_finish_load_;
};
} // namespace LightGBM
......
......@@ -8,21 +8,21 @@ namespace LightGBM {
class DatasetLoader {
public:
DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
LIGHTGBM_EXPORT DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
~DatasetLoader();
LIGHTGBM_EXPORT ~DatasetLoader();
Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
Dataset* LoadFromFile(const char* filename) {
LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
return LoadFromFile(filename, 0, 1);
}
Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines);
Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
size_t total_sample_size, data_size_t num_data);
/*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete;
......@@ -31,6 +31,8 @@ public:
private:
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
void SetHeader(const char* filename);
void CheckDataset(const Dataset* dataset);
......@@ -71,7 +73,6 @@ private:
std::vector<std::string> feature_names_;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> categorical_features_;
};
}
......
#ifndef LIGHTGBM_EXPORT_H_
#define LIGHTGBM_EXPORT_H_
/** Macros for exporting symbols in MSVC/GCC/CLANG **/
#ifdef __cplusplus
#define LIGHTGBM_EXTERN_C extern "C"
#else
#define LIGHTGBM_EXTERN_C
#endif
#ifdef _MSC_VER
#define LIGHTGBM_EXPORT __declspec(dllexport)
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
#else
#define LIGHTGBM_EXPORT
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
#endif
#endif /** LIGHTGBM_EXPORT_H_ **/
#ifndef LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_
#include <LightGBM/utils/random.h>
......@@ -12,22 +12,41 @@
namespace LightGBM {
/*! \brief Using to store data and providing some operations on one feature*/
class Feature {
class Dataset;
class DatasetLoader;
/*! \brief Used to store data and provide some operations on one feature group */
class FeatureGroup {
public:
friend Dataset;
friend DatasetLoader;
/*!
* \brief Constructor
* \param feature_idx Index of this feature
* \param bin_mapper Bin mapper for this feature
* \param num_feature number of features of this group
* \param bin_mappers Bin mapper for features
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
*/
Feature(int feature_idx, BinMapper* bin_mapper,
data_size_t num_data, bool is_enable_sparse)
:bin_mapper_(bin_mapper) {
feature_index_ = feature_idx;
bin_data_.reset(Bin::CreateBin(num_data, bin_mapper_->num_bin(),
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_, bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
/*!
 * \brief Build a feature group from per-feature bin mappers.
 *        Takes ownership of the given bin mappers (they are release()d).
 * \param num_feature Number of features bundled in this group
 * \param bin_mappers One bin mapper per feature; size must equal num_feature
 * \param num_data Total number of data rows
 * \param is_enable_sparse Allow a sparse bin representation
 */
FeatureGroup(int num_feature,
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
data_size_t num_data, bool is_enable_sparse) : num_feature_(num_feature) {
CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
// use bin at zero to store default_bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
// Take ownership of this feature's mapper.
bin_mappers_.emplace_back(bin_mappers[i].release());
auto num_bin = bin_mappers_[i]->num_bin();
// A default bin of 0 is folded into the shared zero slot, so it does
// not consume a bin of its own (see PushData's matching adjustment).
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
// Estimated count of non-default entries this feature contributes.
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
// Overall sparsity of the group decides dense vs. sparse bin storage.
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, &is_sparse_));
}
/*!
* \brief Constructor from memory
......@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
Feature(const void* memory, data_size_t num_all_data,
FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get feature index
feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(feature_index_);
// get is_sparse
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
// get bin mapper
bin_mapper_.reset(new BinMapper(memory_ptr));
memory_ptr += bin_mapper_->SizesInByte();
bin_mappers_.clear();
bin_offsets_.clear();
// start from 1, due to need to store zero bin in this slot
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
}
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
/*! \brief Destructor */
~Feature() {
}
bool CheckAlign(const Feature& other) const {
if (feature_index_ != other.feature_index_) {
return false;
}
return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
~FeatureGroup() {
}
/*!
......@@ -76,66 +100,91 @@ public:
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, data_size_t line_idx, double value) {
unsigned int bin = bin_mapper_->ValueToBin(value);
/*!
 * \brief Bin one raw value and push it into the group's shared bin data.
 * \param tid Thread id of the caller
 * \param sub_feature_idx Which feature inside this group the value belongs to
 * \param line_idx Row index of the record
 * \param value Raw feature value
 */
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
// Default-bin values are implicit (represented by the shared zero slot) -- skip.
if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) { return; }
// Shift into this sub-feature's range within the group-wide bin space.
bin += bin_offsets_[sub_feature_idx];
// When the default bin is 0 it was not given a slot of its own, so the
// remaining bins of this sub-feature are shifted down by one.
if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
bin -= 1;
}
bin_data_->Push(tid, line_idx, bin);
}
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
bin_data_->Push(tid, line_idx, bin);
/*! \brief Fill this group's bin data from selected rows of another (full) group. */
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
  auto* src_bin_data = full_feature->bin_data_.get();
  bin_data_->CopySubset(src_bin_data, used_indices, num_used_indices);
}
/*! \brief Signal the underlying bin data that pushing is complete. */
inline void FinishLoad() {
  bin_data_->FinishLoad();
}
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
/*! \brief Bin mapper that this feature used */
inline const BinMapper* bin_mapper() const { return bin_mapper_.get(); }
/*! \brief Number of bin of this feature */
inline int num_bin() const { return bin_mapper_->num_bin(); }
inline BinType bin_type() const { return bin_mapper_->bin_type(); }
/*! \brief Get bin data of this feature */
inline const Bin* bin_data() const { return bin_data_.get(); }
/*!
 * \brief Create an iterator restricted to one sub-feature's bin range.
 * \param sub_feature Index of the feature inside this group
 * \return Iterator over bins [bin_offsets_[i], bin_offsets_[i+1]) of the
 *         group's shared bin data; caller takes ownership.
 */
inline BinIterator* SubFeatureIterator(int sub_feature) {
  const uint32_t range_begin = bin_offsets_[sub_feature];
  const uint32_t range_last = bin_offsets_[sub_feature + 1] - 1;
  return bin_data_->GetIterator(range_begin, range_last,
                                bin_mappers_[sub_feature]->GetDefaultBin());
}
/*!
 * \brief Split rows on one sub-feature's threshold via the underlying bin data.
 * \param sub_feature Index of the feature inside this group
 * \param threshold Threshold expressed as an (un-offset) bin index
 * \param data_indices Row indices to partition
 * \param num_data Number of indices in data_indices
 * \param lte_indices Output buffer for rows on the <= side
 * \param gt_indices Output buffer for rows on the > side
 * \return Forwarded from Bin::Split -- presumably the count of rows placed on
 *         the <= side; confirm against the Bin implementation.
 */
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
// Restrict to this sub-feature's slice of the group-wide bin space.
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*!
* \brief From bin to feature value
* \param bin
* \return Feature value of this bin
* \return Feature value of this bin
*/
inline double BinToValue(unsigned int bin)
const { return bin_mapper_->BinToValue(bin); }
/*! \brief Convert a bin index of one sub-feature back to a representative feature value. */
inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
  const auto& mapper = bin_mappers_[sub_feature_idx];
  return mapper->BinToValue(bin);
}
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(FILE* file) const {
fwrite(&feature_index_, sizeof(feature_index_), 1, file);
fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
bin_mapper_->SaveBinaryToFile(file);
fwrite(&num_feature_, sizeof(num_feature_), 1, file);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(file);
}
bin_data_->SaveBinaryToFile(file);
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
return sizeof(feature_index_) + sizeof(is_sparse_) +
bin_mapper_->SizesInByte() + bin_data_->SizesInByte();
size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
return ret;
}
/*! \brief Disable copy */
Feature& operator=(const Feature&) = delete;
FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Disable copy */
Feature(const Feature&) = delete;
FeatureGroup(const FeatureGroup&) = delete;
private:
/*! \brief Index of this feature */
int feature_index_;
/*! \brief Bin mapper that this feature used */
std::unique_ptr<BinMapper> bin_mapper_;
/*! \brief Number of features */
int num_feature_;
/*! \brief Bin mapper for sub features */
std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
/*! \brief Bin offsets for sub features */
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_;
/*! \brief True if this feature is sparse */
bool is_sparse_;
int num_total_bin_;
};
} // namespace LightGBM
#endif // LightGBM_FEATURE_H_
#endif // LIGHTGBM_FEATURE_GROUP_H_
......@@ -46,7 +46,7 @@ public:
* \param type Specific type of metric
* \param config Config for metric
*/
static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
};
......
......@@ -41,7 +41,7 @@ public:
* When number of machines is not power of 2, need group machines into power of 2 group.
* And we can let each group has at most 2 machines.
* if the group only has 1 machine. this machine is the normal node
* if the grou has 2 machines, this group will have two type of nodes, one is the leader.
* if the group has 2 machines, this group will have two type of nodes, one is the leader.
* leader will represent this group and communication with others.
*/
enum RecursiveHalvingNodeType {
......
......@@ -44,7 +44,7 @@ public:
* \param type Specific type of objective function
* \param config Config for objective function
*/
static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
const ObjectiveConfig& config);
};
......
......@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <string>
......@@ -46,7 +45,7 @@ public:
* \param gain Split gain
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, unsigned int threshold, int real_feature,
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
......@@ -64,8 +63,9 @@ public:
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void AddPredictionToScore(const Dataset* data, data_size_t num_data,
double* score) const;
void AddPredictionToScore(const Dataset* data,
data_size_t num_data,
double* score) const;
/*!
* \brief Adding prediction value of this tree model to scorese
......@@ -93,7 +93,7 @@ public:
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; }
inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
/*!
* \brief Shrinkage for the tree's output
......@@ -101,9 +101,11 @@ public:
* \param rate The factor of shrinkage
*/
inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = leaf_value_[i] * rate;
leaf_value_[i] *= rate;
}
shrinkage_ *= rate;
}
/*! \brief Serialize this object to string*/
......@@ -138,18 +140,10 @@ public:
}
}
static std::vector<std::function<bool(unsigned int, unsigned int)>> inner_decision_funs;
static std::vector<std::function<bool(double, double)>> decision_funs;
static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
static std::vector<bool(*)(double, double)> decision_funs;
private:
/*!
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline int GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
/*!
* \brief Find leaf index of which record belongs by features
......@@ -171,11 +165,11 @@ private:
/*! \brief A non-leaf node's right child */
std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_;
std::vector<int> split_feature_inner;
/*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_real_;
std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */
std::vector<unsigned int> threshold_in_bin_;
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
......@@ -195,39 +189,34 @@ private:
std::vector<data_size_t> internal_count_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
};
inline double Tree::Predict(const double* feature_values) const {
int leaf = GetLeaf(feature_values);
return LeafOutput(leaf);
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
return LeafOutput(leaf);
} else {
return 0.0f;
}
}
inline int Tree::PredictLeafIndex(const double* feature_values) const {
int leaf = GetLeaf(feature_values);
return leaf;
}
inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iterators[split_feature_[node]]->Get(data_idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
return leaf;
} else {
return 0;
}
return ~node;
}
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
while (node >= 0) {
if (decision_funs[decision_type_[node]](
feature_values[split_feature_real_[node]],
feature_values[split_feature_[node]],
threshold_[node])) {
node = left_child_[node];
} else {
......
......@@ -27,6 +27,8 @@ public:
*/
virtual void Init(const Dataset* train_data) = 0;
virtual void ResetTrainingData(const Dataset* train_data) = 0;
/*!
* \brief Reset tree configs
* \param tree_config config of tree
......
......@@ -3,6 +3,7 @@
#include <vector>
#include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace LightGBM {
......@@ -12,88 +13,136 @@ namespace LightGBM {
template<typename VAL_T>
class ArrayArgs {
public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
int step = std::max(1, (static_cast<int>(array.size()) + num_threads - 1) / num_threads);
std::vector<size_t> arg_maxs(num_threads, 0);
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < num_threads; ++i) {
size_t start = step * i;
if (start >= array.size()) { continue; }
size_t end = std::min(array.size(), start + step);
size_t arg_max = start;
for (size_t j = start + 1; j < end; ++j) {
if (array[j] > array[arg_max]) {
arg_max = j;
}
}
arg_maxs[i] = arg_max;
}
size_t ret = arg_maxs[0];
for (int i = 1; i < num_threads; ++i) {
if (array[arg_maxs[i]] > array[ret]) {
ret = arg_maxs[i];
}
}
return ret;
}
inline static size_t ArgMax(const std::vector<VAL_T>& array) {
if (array.empty()) {
return 0;
}
size_t argMax = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[argMax]) {
argMax = i;
if (array.size() > 100) {
return ArgMaxMT(array);
} else {
size_t arg_max = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[arg_max]) {
arg_max = i;
}
}
return arg_max;
}
return argMax;
}
inline static size_t ArgMin(const std::vector<VAL_T>& array) {
if (array.empty()) {
return 0;
}
size_t argMin = 0;
size_t arg_min = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] < array[argMin]) {
argMin = i;
if (array[i] < array[arg_min]) {
arg_min = i;
}
}
return argMin;
return arg_min;
}
inline static size_t ArgMax(const VAL_T* array, size_t n) {
if (n <= 0) {
return 0;
}
size_t argMax = 0;
size_t arg_max = 0;
for (size_t i = 1; i < n; ++i) {
if (array[i] > array[argMax]) {
argMax = i;
if (array[i] > array[arg_max]) {
arg_max = i;
}
}
return argMax;
return arg_max;
}
inline static size_t ArgMin(const VAL_T* array, size_t n) {
if (n <= 0) {
return 0;
}
size_t argMin = 0;
size_t arg_min = 0;
for (size_t i = 1; i < n; ++i) {
if (array[i] < array[argMin]) {
argMin = i;
if (array[i] < array[arg_min]) {
arg_min = i;
}
}
return argMin;
return arg_min;
}
inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) {
VAL_T& pivot = (*array)[end - 1];
size_t p_idx = start;
for (size_t i = start; i < end - 1; ++i) {
if ((*array)[i] > pivot) {
std::swap((*array)[p_idx], (*array)[i]);
++p_idx;
}
inline static void Partition(std::vector<VAL_T>* arr, int start, int end, int* l, int* r) {
int i = start - 1;
int j = end - 1;
int p = i;
int q = j;
if (start >= end) {
return;
}
std::swap((*array)[p_idx], (*array)[end - 1]);
return p_idx;
std::vector<VAL_T>& ref = *arr;
VAL_T v = ref[end - 1];
for (;;) {
while (ref[++i] > v);
while (v > ref[--j]) { if (j == start) { break; } }
if (i >= j) { break; }
std::swap(ref[i], ref[j]);
if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); }
if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); }
}
std::swap(ref[i], ref[end - 1]);
j = i - 1;
i = i + 1;
for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); }
for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); }
*l = j;
*r = i;
};
inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) {
if (start == end - 1) {
inline static int ArgMaxAtK(std::vector<VAL_T>* arr, int start, int end, int k) {
if (start >= end - 1) {
return start;
}
size_t p_idx = Partition(array, start, end);
if (p_idx == k) {
return p_idx;
}
else if (k < p_idx) {
return ArgMaxAtK(array, start, p_idx, k);
}
else {
return ArgMaxAtK(array, p_idx + 1, end, k);
int l = start;
int r = end - 1;
Partition(arr, start, end, &l, &r);
if ((k > l && k < r) || l == 0 || r == end - 1) {
return k;
} else if (k <= l) {
return ArgMaxAtK(arr, start, l, k);
} else {
return ArgMaxAtK(arr, r, end, k);
}
}
inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) {
inline static void MaxK(const std::vector<VAL_T>& array, int k, std::vector<VAL_T>* out) {
out->clear();
if (k <= 0) {
return;
......@@ -101,10 +150,10 @@ public:
for (auto val : array) {
out->push_back(val);
}
if (k >= array.size()) {
if (static_cast<size_t>(k) >= array.size()) {
return;
}
ArgMaxAtK(out, 0, out->size(), k - 1);
ArgMaxAtK(out, 0, static_cast<int>(out->size()), k - 1);
out->erase(out->begin() + k, out->end());
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <vector>
#include <functional>
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment