Commit 4f77bd28 authored by Guolin Ke

update to v2.

parent 13d4581b
......@@ -20,7 +20,9 @@ public:
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
size_t total_sample_size, data_size_t num_data);
/*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete;
......@@ -69,8 +71,6 @@ private:
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief Indices of categorical features */
std::unordered_set<int> categorical_features_;
};
......
#ifndef LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_
#include <LightGBM/utils/random.h>
......@@ -12,22 +12,41 @@
namespace LightGBM {
/*! \brief Used to store data and provide some operations on one feature*/
class Feature {
class Dataset;
class DatasetLoader;
/*! \brief Used to store data and provide some operations on one feature group*/
class FeatureGroup {
public:
friend Dataset;
friend DatasetLoader;
/*!
* \brief Constructor
* \param feature_idx Index of this feature
* \param bin_mapper Bin mapper for this feature
* \param num_feature Number of features in this group
* \param bin_mappers Bin mappers for the features in this group
* \param num_data Total number of data
* \param is_enable_sparse True to enable sparse bin storage
*/
Feature(int feature_idx, BinMapper* bin_mapper,
data_size_t num_data, bool is_enable_sparse)
:bin_mapper_(bin_mapper) {
feature_index_ = feature_idx;
bin_data_.reset(Bin::CreateBin(num_data, bin_mapper_->num_bin(),
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_, bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
FeatureGroup(int num_feature,
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
data_size_t num_data, bool is_enable_sparse) : num_feature_(num_feature) {
CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
// bin 0 is reserved to store the default bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(bin_mappers[i].release());
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, &is_sparse_));
}
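
For intuition, a standalone sketch (not part of this commit) of the offset scheme the constructor above builds: all sub-features share one group-wide bin space, bin 0 is reserved for the default, and a sub-feature whose own default bin is 0 contributes one bin less.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of FeatureGroup's bin packing, with hypothetical inputs.
int main() {
  std::vector<int> num_bins = {10, 5};          // bins per sub-feature
  std::vector<uint32_t> default_bins = {0, 3};  // per-feature default bins
  std::vector<uint32_t> bin_offsets;
  uint32_t num_total_bin = 1;                   // bin 0 stores the default
  bin_offsets.push_back(num_total_bin);
  for (size_t i = 0; i < num_bins.size(); ++i) {
    int nb = num_bins[i];
    if (default_bins[i] == 0) { nb -= 1; }      // default folds into bin 0
    num_total_bin += nb;
    bin_offsets.push_back(num_total_bin);
  }
  for (auto o : bin_offsets) { std::cout << o << ' '; }  // 1 10 15
  std::cout << "total=" << num_total_bin << '\n';        // total=15
}
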
/*!
* \brief Constructor from memory
......@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
Feature(const void* memory, data_size_t num_all_data,
FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get feature index
feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(feature_index_);
// get is_sparse
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
// get bin mapper
bin_mapper_.reset(new BinMapper(memory_ptr));
memory_ptr += bin_mapper_->SizesInByte();
bin_mappers_.clear();
bin_offsets_.clear();
// start from 1, since bin 0 is reserved for the default bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
}
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type()));
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
/*! \brief Destructor */
~Feature() {
}
bool CheckAlign(const Feature& other) const {
if (feature_index_ != other.feature_index_) {
return false;
}
return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
~FeatureGroup() {
}
/*!
......@@ -76,78 +100,91 @@ public:
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, data_size_t line_idx, double value) {
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) { return; }
bin += bin_offsets_[sub_feature_idx];
if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
bin -= 1;
}
bin_data_->Push(tid, line_idx, bin);
}
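
The translation above can be checked in isolation (hypothetical numbers, mirroring the arithmetic in PushData): values that fall into a sub-feature's default bin are skipped entirely, all others are shifted by the sub-feature's offset, minus one when the default bin 0 was folded out.

#include <cstdint>
#include <iostream>

// Standalone mirror of PushData's bin arithmetic; inputs are made up.
// Returns 0 where PushData would simply return without pushing.
uint32_t ToGroupBin(uint32_t bin, uint32_t default_bin, uint32_t offset) {
  if (bin == default_bin) { return 0; }  // default bin: implicit, not stored
  bin += offset;
  if (default_bin == 0) { bin -= 1; }    // the slot for bin 0 was folded out
  return bin;
}

int main() {
  std::cout << ToGroupBin(4, 3, 10) << '\n';  // sub-feature with offset 10: 14
  std::cout << ToGroupBin(2, 0, 1) << '\n';   // default bin 0, offset 1: 2
}
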
inline void CopySubset(const Feature* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
}
inline void ReSize(data_size_t num_data) {
bin_data_->ReSize(num_data);
inline BinIterator* SubFetureIterator(int sub_feature) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->GetIterator(min_bin, max_bin, default_bin);
}
inline bool is_sparse() const { return is_sparse_; }
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
/*! \brief Bin mapper that this feature used */
inline const BinMapper* bin_mapper() const { return bin_mapper_.get(); }
/*! \brief Number of bin of this feature */
inline int num_bin() const { return bin_mapper_->num_bin(); }
inline BinType bin_type() const { return bin_mapper_->bin_type(); }
/*! \brief Get bin data of this feature */
inline const Bin* bin_data() const { return bin_data_.get(); }
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
threshold, data_indices, num_data, lte_indices, gt_indices);
}
/*!
* \brief From bin to feature value
* \param sub_feature_idx Index of the sub-feature within this group
* \param bin The bin index
* \return Feature value of this bin
*/
inline double BinToValue(unsigned int bin)
const { return bin_mapper_->BinToValue(bin); }
inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}
/*!
* \brief Save binary data to file
* \param file The file to write to
*/
void SaveBinaryToFile(FILE* file) const {
fwrite(&feature_index_, sizeof(feature_index_), 1, file);
fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
bin_mapper_->SaveBinaryToFile(file);
fwrite(&num_feature_, sizeof(num_feature_), 1, file);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(file);
}
bin_data_->SaveBinaryToFile(file);
}
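
Read together with the memory constructor earlier in this class, the serialized layout of a FeatureGroup is, in order:

  is_sparse_ (bool) | num_feature_ (int) | num_feature_ serialized BinMappers | serialized Bin data
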
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
return sizeof(feature_index_) + sizeof(is_sparse_) +
bin_mapper_->SizesInByte() + bin_data_->SizesInByte();
size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
return ret;
}
/*! \brief Disable copy */
Feature& operator=(const Feature&) = delete;
FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Disable copy */
Feature(const Feature&) = delete;
FeatureGroup(const FeatureGroup&) = delete;
private:
/*! \brief Index of this feature */
int feature_index_;
/*! \brief Bin mapper that this feature used */
std::unique_ptr<BinMapper> bin_mapper_;
/*! \brief Number of features */
int num_feature_;
/*! \brief Bin mapper for sub features */
std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
/*! \brief Bin offsets for sub features */
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_;
/*! \brief True if this feature is sparse */
bool is_sparse_;
/*! \brief Total number of bins in this feature group */
int num_total_bin_;
};
} // namespace LightGBM
#endif // LightGBM_FEATURE_H_
#endif // LIGHTGBM_FEATURE_GROUP_H_
......@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <string>
......@@ -35,7 +34,6 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
......@@ -46,7 +44,7 @@ public:
* \param gain Split gain
* \return The index of new leaf.
*/
int Split(int leaf, int feature, BinType bin_type, unsigned int threshold, int real_feature,
int Split(int leaf, int feature, uint32_t threshold, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
......@@ -64,8 +62,9 @@ public:
* \param num_data Number of total data
* \param score Score array that the predictions are added to
*/
void AddPredictionToScore(const Dataset* data, data_size_t num_data,
double* score) const;
void AddPredictionToScore(const Dataset* data,
data_size_t num_data,
double* score) const;
/*!
* \brief Adding prediction value of this tree model to scores
......@@ -93,7 +92,7 @@ public:
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; }
inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
/*!
* \brief Shrinkage for the tree's output
......@@ -101,8 +100,9 @@ public:
* \param rate The factor of shrinkage
*/
inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = leaf_value_[i] * rate;
leaf_value_[i] *= rate;
}
}
......@@ -112,15 +112,6 @@ public:
/*! \brief Serialize this object to json*/
std::string ToJSON();
template<typename T>
static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
return false;
}
}
template<typename T>
static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) {
......@@ -130,26 +121,13 @@ public:
}
}
static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) {
return "no_greater";
} else {
return "is";
}
}
private:
static std::vector<bool(*)(unsigned int, unsigned int)> inner_decision_funs;
static std::vector<bool(*)(double, double)> decision_funs;
inline int GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
private:
/*!
* \brief Find the index of the leaf that a record falls into, using bin data
* \param iterators Bin iterators over the dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline int GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
inline int GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
/*!
* \brief Find the index of the leaf that a record falls into, using feature values
......@@ -171,15 +149,13 @@ private:
/*! \brief A non-leaf node's right child */
std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_;
std::vector<int> split_feature_inner;
/*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_real_;
std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */
std::vector<unsigned int> threshold_in_bin_;
std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
// used for leaf node
......@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return leaf;
}
inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
inline int Tree::GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
int node = 0;
while (node >= 0) {
if (NumericalDecision<uint32_t>(
iterators[node]->Get(data_idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
return ~node;
}
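
A note on the return value: internal nodes carry non-negative indices, while a leaf is stored in the child arrays as the bitwise complement of its leaf index, so the walk stops once node turns negative and ~node recovers the leaf. A tiny standalone check (not part of this commit):

#include <iostream>

// Leaf encoding used by Tree: a child entry of ~leaf_idx marks a leaf.
int main() {
  int leaf_idx = 3;
  int encoded = ~leaf_idx;             // -4: negative means "leaf"
  std::cout << (encoded < 0) << '\n';  // 1
  std::cout << ~encoded << '\n';       // 3: leaf index recovered
}
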
inline int Tree::GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iterators[split_feature_[node]]->Get(data_idx),
threshold_in_bin_[node])) {
if (NumericalDecision<uint32_t>(
iterators[split_feature_inner[node]]->Get(data_idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
......@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
while (node >= 0) {
if (decision_funs[decision_type_[node]](
feature_values[split_feature_real_[node]],
if (NumericalDecision<double>(
feature_values[split_feature_[node]],
threshold_[node])) {
node = left_child_[node];
} else {
......
......@@ -3,6 +3,7 @@
#include <vector>
#include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace LightGBM {
......@@ -12,88 +13,136 @@ namespace LightGBM {
template<typename VAL_T>
class ArrayArgs {
public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
int step = std::max(1, (static_cast<int>(array.size()) + num_threads - 1) / num_threads);
std::vector<size_t> arg_maxs(num_threads, 0);
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < num_threads; ++i) {
size_t start = step * i;
if (start >= array.size()) { continue; }
size_t end = std::min(array.size(), start + step);
size_t arg_max = start;
for (size_t j = start + 1; j < end; ++j) {
if (array[j] > array[arg_max]) {
arg_max = j;
}
}
arg_maxs[i] = arg_max;
}
size_t ret = arg_maxs[0];
for (int i = 1; i < num_threads; ++i) {
if (array[arg_maxs[i]] > array[ret]) {
ret = arg_maxs[i];
}
}
return ret;
}
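
ArgMaxMT splits the array into one contiguous block per thread, finds a local arg-max per block, and then reduces the per-block winners serially. The same idea in a serial standalone sketch (hypothetical sizes; the commit parallelizes the block loop with OpenMP):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> a(1000, 0.0);
  a[421] = 5.0;
  const int num_blocks = 4;  // stands in for the OpenMP thread count
  size_t step = (a.size() + num_blocks - 1) / num_blocks;
  size_t best = 0;
  for (int b = 0; b < num_blocks; ++b) {
    size_t start = b * step;
    size_t end = std::min(a.size(), start + step);
    size_t local = start;
    for (size_t j = start + 1; j < end; ++j) {
      if (a[j] > a[local]) { local = j; }      // block-local arg-max
    }
    if (a[local] > a[best]) { best = local; }  // serial reduction
  }
  std::cout << best << '\n';  // 421
}
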
inline static size_t ArgMax(const std::vector<VAL_T>& array) {
if (array.empty()) {
return 0;
}
size_t argMax = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[argMax]) {
argMax = i;
if (array.size() > 100) {
return ArgMaxMT(array);
} else {
size_t arg_max = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[arg_max]) {
arg_max = i;
}
}
return arg_max;
}
return argMax;
}
inline static size_t ArgMin(const std::vector<VAL_T>& array) {
if (array.empty()) {
return 0;
}
size_t argMin = 0;
size_t arg_min = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] < array[argMin]) {
argMin = i;
if (array[i] < array[arg_min]) {
arg_min = i;
}
}
return argMin;
return arg_min;
}
inline static size_t ArgMax(const VAL_T* array, size_t n) {
if (n <= 0) {
return 0;
}
size_t argMax = 0;
size_t arg_max = 0;
for (size_t i = 1; i < n; ++i) {
if (array[i] > array[argMax]) {
argMax = i;
if (array[i] > array[arg_max]) {
arg_max = i;
}
}
return argMax;
return arg_max;
}
inline static size_t ArgMin(const VAL_T* array, size_t n) {
if (n <= 0) {
return 0;
}
size_t argMin = 0;
size_t arg_min = 0;
for (size_t i = 1; i < n; ++i) {
if (array[i] < array[argMin]) {
argMin = i;
if (array[i] < array[arg_min]) {
arg_min = i;
}
}
return argMin;
return arg_min;
}
inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) {
VAL_T& pivot = (*array)[end - 1];
size_t p_idx = start;
for (size_t i = start; i < end - 1; ++i) {
if ((*array)[i] > pivot) {
std::swap((*array)[p_idx], (*array)[i]);
++p_idx;
}
inline static void Partition(std::vector<VAL_T>* arr, int start, int end, int* l, int* r) {
int i = start - 1;
int j = end - 1;
int p = i;
int q = j;
if (start >= end) {
return;
}
std::swap((*array)[p_idx], (*array)[end - 1]);
return p_idx;
std::vector<VAL_T>& ref = *arr;
VAL_T v = ref[end - 1];
for (;;) {
while (ref[++i] > v);
while (v > ref[--j]) { if (j == start) { break; } }
if (i >= j) { break; }
std::swap(ref[i], ref[j]);
if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); }
if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); }
}
std::swap(ref[i], ref[end - 1]);
j = i - 1;
i = i + 1;
for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); }
for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); }
*l = j;
*r = i;
};
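
The new Partition is a Bentley-McIlroy style three-way partition in descending order: elements equal to the pivot are parked at both ends during the scan and folded back into the middle afterwards, and *l / *r end up bracketing the pivot-equal run so that ArgMaxAtK can stop early on duplicate-heavy data. A simplified equivalent for intuition (Dutch-national-flag form, not the commit's exact routine):

#include <iostream>
#include <utility>
#include <vector>

// After the call: [0, lt) > pivot, [lt, gt] == pivot, (gt, size) < pivot.
void ThreeWay(std::vector<int>& a, int pivot, int* lt_out, int* gt_out) {
  int lt = 0, gt = static_cast<int>(a.size()) - 1, i = 0;
  while (i <= gt) {
    if (a[i] > pivot) { std::swap(a[i++], a[lt++]); }
    else if (a[i] < pivot) { std::swap(a[i], a[gt--]); }
    else { ++i; }
  }
  *lt_out = lt;
  *gt_out = gt;
}

int main() {
  std::vector<int> a = {3, 5, 5, 1, 5};
  int lt, gt;
  ThreeWay(a, 5, &lt, &gt);
  for (int x : a) { std::cout << x << ' '; }         // 5 5 5 1 3
  std::cout << "lt=" << lt << " gt=" << gt << '\n';  // lt=0 gt=2
}
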
inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) {
if (start == end - 1) {
inline static int ArgMaxAtK(std::vector<VAL_T>* arr, int start, int end, int k) {
if (start >= end - 1) {
return start;
}
size_t p_idx = Partition(array, start, end);
if (p_idx == k) {
return p_idx;
}
else if (k < p_idx) {
return ArgMaxAtK(array, start, p_idx, k);
}
else {
return ArgMaxAtK(array, p_idx + 1, end, k);
int l = start;
int r = end - 1;
Partition(arr, start, end, &l, &r);
if ((k > l && k < r) || l == 0 || r == end - 1) {
return k;
} else if (k <= l) {
return ArgMaxAtK(arr, start, l, k);
} else {
return ArgMaxAtK(arr, r, end, k);
}
}
inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) {
inline static void MaxK(const std::vector<VAL_T>& array, int k, std::vector<VAL_T>* out) {
out->clear();
if (k <= 0) {
return;
......@@ -104,7 +153,7 @@ public:
if (k >= array.size()) {
return;
}
ArgMaxAtK(out, 0, out->size(), k - 1);
ArgMaxAtK(out, 0, static_cast<int>(out->size()), k - 1);
out->erase(out->begin() + k, out->end());
}
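
Hypothetical usage of the updated MaxK, assuming the header from this commit is on the include path; the k largest values end up in out, in no guaranteed order.

#include <LightGBM/utils/array_args.h>

#include <iostream>
#include <vector>

int main() {
  std::vector<double> v = {4, 9, 1, 7, 3};
  std::vector<double> out;
  LightGBM::ArrayArgs<double>::MaxK(v, 2, &out);
  std::cout << out.size() << '\n';  // 2; holds {9, 7} in some order
}
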
......
......@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac = 0;
scale = 1.0;
if ((*p == 'e') || (*p == 'E')) {
unsigned int expon;
uint32_t expon;
// Get sign of exponent, if any.
++p;
if (*p == '-') {
......
......@@ -20,30 +20,41 @@ public:
std::random_device rd;
auto generator = std::mt19937(rd());
std::uniform_int_distribution<int> distribution(0, x);
x = static_cast<unsigned int>(distribution(generator));
x = distribution(generator);
}
/*!
* \brief Constructor, with specific seed
*/
Random(int seed) {
x = static_cast<unsigned int>(seed);
x = seed;
}
/*!
* \brief Generate random integer
* \brief Generate random integer from 15 random bits, range [0, 32768)
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return A random integer in [lower_bound, upper_bound)
*/
inline int NextShort(int lower_bound, int upper_bound) {
return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
}
/*!
* \brief Generate random integer, 31-bit range
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return A random integer in [lower_bound, upper_bound)
*/
inline int NextInt(int lower_bound, int upper_bound) {
return (fastrand()) % (upper_bound - lower_bound) + lower_bound;
return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
}
/*!
* \brief Generate random float data
* \return A random float in [0.0, 1.0)
*/
inline float NextFloat() {
// get random float in [0,1)
return static_cast<float>(fastrand()) / (32768.0f);
return static_cast<float>(RandInt16()) / (32768.0f);
}
/*!
* \brief Sample K data from {0,1,...,N-1}
......@@ -65,10 +76,16 @@ public:
return ret;
}
private:
inline int fastrand() {
inline int RandInt16() {
x = (214013 * x + 2531011);
return (x >> 16) & 0x7FFF;
}
inline int RandInt32() {
x = (214013 * x + 2531011);
return x & 0x7FFFFFFF;
}
int x = 123456789;
};
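
The generator behind both helpers is the classic MSVC-style linear congruential generator, x = 214013 * x + 2531011 (mod 2^32); RandInt16 keeps 15 of the high bits, RandInt32 masks the low 31. One step as a standalone sketch (unsigned arithmetic is used here to make the mod-2^32 wraparound explicit; the member above is a plain int):

#include <cstdio>

int main() {
  unsigned int x = 123456789;    // same default seed as the member above
  x = 214013u * x + 2531011u;    // one LCG step
  int r16 = (x >> 16) & 0x7FFF;  // 15 high bits, range [0, 32768)
  std::printf("state=%u r16=%d\n", x, r16);
}
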
......
......@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf):
if is_left_child:
op = 'equal' if decision_type[prev_node_idx] == 1 else 'lessOrEqual'
op = 'lessOrEqual'
else:
op = 'notEqual' if decision_type[prev_node_idx] == 1 else 'greaterThan'
op = 'greaterThan'
out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format(
get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf)))
......@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature = get_array_ints(next(model_content))
split_gain = next(model_content) # unused
threshold = get_array_strings(next(model_content))
decision_type = get_array_ints(next(model_content))
left_child = get_array_ints(next(model_content))
right_child = get_array_ints(next(model_content))
leaf_parent = get_array_ints(next(model_content))
......
......@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
def _data_from_pandas(data, feature_name):
if isinstance(data, DataFrame):
if feature_name == 'auto' or feature_name is None:
if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]):
......@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
warnings.filterwarnings('once')
warnings.warn(msg, stacklevel=5)
data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
......@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
return data, feature_name
def _label_from_pandas(label):
......@@ -277,19 +256,6 @@ def _label_from_pandas(label):
return label
def _save_pandas_categorical(file_name, pandas_categorical):
with open(file_name, 'a') as f:
f.write('\npandas_categorical:' + json.dumps(pandas_categorical, default=json_default_with_numpy))
def _load_pandas_categorical(file_name):
with open(file_name, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
return json.loads(last_line[len('pandas_categorical:'):])
return None
class _InnerPredictor(object):
"""
A _InnerPredictor of LightGBM.
......@@ -321,7 +287,6 @@ class _InnerPredictor(object):
ctypes.byref(out_num_class)))
self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif booster_handle is not None:
self.__is_manage_handle = False
self.handle = booster_handle
......@@ -335,7 +300,6 @@ class _InnerPredictor(object):
self.handle,
ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else:
raise TypeError('Need Model file or Booster handle to create a predictor')
......@@ -371,7 +335,7 @@ class _InnerPredictor(object):
"""
if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
data = _data_from_pandas(data, None)[0]
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE
......@@ -532,7 +496,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False,
feature_name='auto', categorical_feature='auto', params=None,
feature_name='auto', params=None,
free_raw_data=True):
"""
Parameters
......@@ -555,11 +519,6 @@ class Dataset(object):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, the data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
......@@ -574,12 +533,10 @@ class Dataset(object):
self.group = group
self.silent = silent
self.feature_name = feature_name
self.categorical_feature = categorical_feature
self.params = params
self.free_raw_data = free_raw_data
self.used_indices = None
self._predictor = None
self.pandas_categorical = None
def __del__(self):
self._free_handle()
......@@ -592,11 +549,11 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None,
silent=False, feature_name='auto',
categorical_feature='auto', params=None):
params=None):
if data is None:
self.handle = None
return
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
data, feature_name = _data_from_pandas(data, feature_name)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
......@@ -608,23 +565,6 @@ class Dataset(object):
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
"""get categorical features"""
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
ref_dataset = None
......@@ -784,7 +724,7 @@ class Dataset(object):
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=self.params)
params=self.params)
if self.free_raw_data:
self.data = None
return self
......@@ -814,7 +754,6 @@ class Dataset(object):
weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret
def subset(self, used_indices, params=None):
......@@ -829,9 +768,8 @@ class Dataset(object):
Other parameters
"""
ret = Dataset(None, reference=self, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=params)
params=params)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices
return ret
......@@ -939,24 +877,6 @@ class Dataset(object):
else:
raise TypeError("Unknown type")
def set_categorical_feature(self, categorical_feature):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if self.categorical_feature == categorical_feature:
return
if self.data is not None:
self.categorical_feature = categorical_feature
self._free_handle()
else:
raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def _set_predictor(self, predictor):
"""
Set predictor for continued training. Users are not recommended to call this function directly.
......@@ -979,7 +899,6 @@ class Dataset(object):
reference : Dataset
Will use reference as a template to construct the current dataset
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
self._set_predictor(reference._predictor)
if self.reference is reference:
......@@ -1208,7 +1127,6 @@ class Booster(object):
self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None:
"""Prediction task"""
out_num_iterations = ctypes.c_int(0)
......@@ -1221,7 +1139,6 @@ class Booster(object):
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif 'model_str' in params:
self.__load_model_from_string(params['model_str'])
else:
......@@ -1237,7 +1154,6 @@ class Booster(object):
def __deepcopy__(self, _):
model_str = self.__save_model_to_string()
booster = Booster({'model_str': model_str})
booster.pandas_categorical = self.pandas_categorical
return booster
def __getstate__(self):
......@@ -1477,7 +1393,6 @@ class Booster(object):
self.handle,
ctypes.c_int(num_iteration),
c_str(filename)))
_save_pandas_categorical(filename, self.pandas_categorical)
def __load_model_from_string(self, model_str):
"""[Private] Load model from string"""
......@@ -1589,7 +1504,6 @@ class Booster(object):
def _to_predictor(self):
"""Convert to predictor"""
predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor
def feature_name(self):
......
......@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto',
feature_name='auto',
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""
......@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, the data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
......@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100,
train_set._update_params(params)
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False
train_data_name = "training"
......@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10,
data_splitter=None, nfold=5, stratified=False, shuffle=True,
metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto',
feature_name='auto',
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
......@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, the data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
......@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10,
train_set._update_params(params)
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics:
params.setdefault('metric', [])
......
......@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if info in {'split_gain', 'internal_value', 'internal_count'}:
label += '\n' + info + ':' + str(root[info])
graph.node(name, label=label)
if root['decision_type'] == 'no_greater':
l_dec, r_dec = '<=', '>'
elif root['decision_type'] == 'is':
l_dec, r_dec = 'is', "isn't"
else:
raise ValueError('Invalid decision type in tree model.')
l_dec, r_dec = '<=', '>'
add(root['left_child'], name, l_dec)
add(root['right_child'], name, r_dec)
else: # leaf
......
......@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto',
feature_name='auto',
callbacks=None):
"""
Fit the gradient boosting model
......@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is a pandas DataFrame, the data column names are used
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
......@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
if evals_result:
......@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None,
eval_metric="l2",
early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', callbacks=None):
feature_name='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
......@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
......@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None,
eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto',
feature_name='auto',
callbacks=None):
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
......@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
......@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto',
feature_name='auto',
callbacks=None):
"""
Most arguments are the same as for the common fit methods, except the following:
......@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel):
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks)
return self
#include <LightGBM/boosting.h>
#include "gbdt.h"
#include "dart.hpp"
#include "goss.hpp"
namespace LightGBM {
......@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GBDT();
} else if (type == std::string("dart")) {
return new DART();
} else if (type == std::string("goss")) {
return new GOSS();
} else {
return nullptr;
}
......@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GBDT());
} else if (type == std::string("dart")) {
ret.reset(new DART());
} else if (type == std::string("goss")) {
ret.reset(new GOSS());
} else {
Log::Fatal("unknow boosting type %s", type.c_str());
}
LoadFileToBoosting(ret.get(), filename);
} else {
......
......@@ -38,6 +38,11 @@ public:
random_for_drop_ = Random(gbdt_config_->drop_seed);
sum_weight_ = 0.0f;
}
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
}
/*!
* \brief one training iteration
*/
......
......@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
......@@ -37,7 +36,6 @@ GBDT::GBDT()
}
GBDT::~GBDT() {
}
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
......@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
label_idx_ = train_data->label_idx();
// get feature names
feature_names_ = train_data->feature_names();
// get feature infos
feature_infos_.clear();
for (int i = 0; i < max_feature_idx_ + 1; ++i) {
int feature_idx = train_data->GetInnerFeatureIndex(i);
if (feature_idx < 0) {
feature_infos_.push_back("trival feature");
} else {
feature_infos_.push_back(train_data->FeatureAt(feature_idx)->bin_mapper()->bin_info());
}
}
}
if ((train_data_ != train_data && train_data != nullptr)
......@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const {
ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
ss << std::endl << "feature information:" << std::endl;
for (int i = 0; i < max_feature_idx_ + 1; ++i) {
ss << feature_names_[i] << "=" << feature_infos_[i] << std::endl;
}
return ss.str();
}
......@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
Log::Fatal("Wrong size of feature_names");
return false;
}
} else {
}
else {
Log::Fatal("Model file doesn't contain feature names");
return false;
}
// returns offset, or lines.size() if not found.
auto find_string_lineno = [&lines](const std::string &str, size_t start_line)
{
size_t i = start_line;
size_t featinfo_find_pos = std::string::npos;
while (i < lines.size()) {
featinfo_find_pos = lines[i].find(str);
if (featinfo_find_pos != std::string::npos)
break;
++i;
}
return i;
};
// load feature information
{
size_t finfo_line_idx = find_string_lineno("feature information:", 0);
if (finfo_line_idx >= lines.size()) {
Log::Fatal("Model file doesn't contain feature information");
return false;
}
feature_infos_.resize(max_feature_idx_ + 1);
// search for each feature name
for (int i=0; i < max_feature_idx_ + 1; i++) {
const auto feat_name = feature_names_[i];
size_t line_idx = find_string_lineno(feat_name + "=", finfo_line_idx + 1);
if (line_idx >= lines.size()) {
Log::Fatal(("Model file doesn't contain feature information for feature " + feat_name).c_str());
return false;
}
const auto this_line = lines[line_idx];
feature_infos_[i] = this_line.substr((feat_name + "=").size());
}
}
// get tree models
size_t i = 0;
while (i < lines.size()) {
......@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
++feature_importances[models_[iter]->split_feature_real(split_idx)];
++feature_importances[models_[iter]->split_feature(split_idx)];
}
}
// store the importance first
......
......@@ -329,8 +329,6 @@ protected:
int num_init_iteration_;
/*! \brief Feature names */
std::vector<std::string> feature_names_;
/*! \brief Feature informations */
std::vector<std::string> feature_infos_;
/*! \brief number of threads */
int num_threads_;
/*! \brief Buffer for multi-threading bagging */
......
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace LightGBM {
class GOSS: public GBDT {
public:
/*!
* \brief Constructor
*/
GOSS() : GBDT() {
}
~GOSS() {
}
void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::Init(config, train_data, object_function, training_metrics);
CHECK(gbdt_config_->top_rate + gbdt_config_->other_rate <= 1.0f);
CHECK(gbdt_config_->top_rate > 0.0f && gbdt_config_->other_rate > 0.0f);
if (gbdt_config_->bagging_freq > 0 && gbdt_config_->bagging_fraction != 1.0f) {
Log::Fatal("cannot used bagging in GOSS");
}
Log::Info("using GOSS");
}
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
if (config->bagging_freq > 0 && config->bagging_fraction != 1.0f) {
Log::Fatal("cannot used bagging in GOSS");
}
GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
if (train_data_ == nullptr) { return; }
bag_data_indices_.resize(num_data_);
tmp_indices_.resize(num_data_);
tmp_indice_right_.resize(num_data_);
offsets_buf_.resize(num_threads_);
left_cnts_buf_.resize(num_threads_);
right_cnts_buf_.resize(num_threads_);
left_write_pos_buf_.resize(num_threads_);
right_write_pos_buf_.resize(num_threads_);
is_use_subset_ = false;
if (config->top_rate + config->other_rate <= 0.5) {
auto bag_data_cnt = static_cast<data_size_t>((config->top_rate + config->other_rate) * num_data_);
tmp_subset_.reset(new Dataset(bag_data_cnt));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
is_use_subset_ = true;
}
// start without bagging
bag_data_cnt_ = num_data_;
}
data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
std::vector<score_t> tmp_gradients(cnt);
for (data_size_t i = 0; i < cnt; ++i) {
tmp_gradients[i] = std::fabs(gradients_[start + i] * hessians_[start + i]);
}
data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
top_k = std::max(1, top_k);
ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
score_t threshold = tmp_gradients[top_k - 1];
score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
data_size_t cur_left_cnt = 0;
data_size_t cur_right_cnt = 0;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
if (std::fabs(gradients_[start + i] * hessians_[start + i]) >= threshold) {
buffer[cur_left_cnt++] = start + i;
++big_weight_cnt;
} else {
data_size_t sampled = cur_left_cnt - big_weight_cnt;
data_size_t rest_need = other_k - sampled;
data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
double prob = (rest_need) / static_cast<double>(rest_all);
if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i;
gradients_[start + i] *= multiply;
hessians_[start + i] *= multiply;
} else {
buffer_right[cur_right_cnt++] = start + i;
}
}
}
return cur_left_cnt;
}
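
Worked numbers for the re-weighting above, with hypothetical rates: the top_rate share with the largest |gradient * hessian| is always kept, the remainder is sampled at random, and every sampled example has its gradient and hessian scaled by (cnt - top_k) / other_k so the sampled sums stay unbiased estimates of the full ones.

#include <iostream>

int main() {
  int cnt = 1000;  // examples in this thread's range
  double top_rate = 0.2, other_rate = 0.1;
  int top_k = static_cast<int>(cnt * top_rate);      // 200 kept outright
  int other_k = static_cast<int>(cnt * other_rate);  // 100 sampled from the rest
  double multiply = static_cast<double>(cnt - top_k) / other_k;
  std::cout << multiply << '\n';  // 8: each sampled gradient scaled by 8x
}
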
void Bagging(int iter) override {
bag_data_cnt_ = num_data_;
// do not subsample for the first 1 / learning_rate iterations
if (iter < static_cast<int>(1.0f / gbdt_config_->learning_rate)) { return; }
const data_size_t min_inner_size = 1000;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
}
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indice_right_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get());
}
}
/*!
* \brief Get Type name of this boosting object
*/
const char* SubModelName() const override { return "tree"; }
private:
std::vector<data_size_t> tmp_indice_right_;
};
} // namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
......
......@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol);
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) {
if (std::fabs(row[j]) > 1e-15) {
sample_values[j].push_back(row[j]);
if (std::fabs(row[j]) > kEpsilon) {
sample_values[j].emplace_back(row[j]);
sample_idx[j].emplace_back(static_cast<int>(i));
}
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
ret->CreateValid(
reinterpret_cast<const Dataset*>(reference));
}
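
The creation paths now record, per feature, both the sampled non-zero values and their row positions within the sample, giving the loader a column-sparse view (presumably so CostructFromSampleData can estimate sparsity and feature co-occurrence when forming feature groups). A standalone sketch of the structure being built, with made-up data:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const double kEpsilon = 1e-15;  // stands in for the commit's threshold
  std::vector<std::vector<double>> rows = {{0.0, 2.5}, {1.0, 0.0}, {0.0, 4.0}};
  const size_t ncol = 2;
  std::vector<std::vector<double>> sample_values(ncol);  // non-zero values
  std::vector<std::vector<int>> sample_idx(ncol);        // their sample rows
  for (size_t i = 0; i < rows.size(); ++i) {
    for (size_t j = 0; j < ncol; ++j) {
      if (std::fabs(rows[i][j]) > kEpsilon) {
        sample_values[j].push_back(rows[i][j]);
        sample_idx[j].push_back(static_cast<int>(i));
      }
    }
  }
  std::cout << sample_idx[1][1] << '\n';  // 2: column 1's second non-zero row
}
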
......@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_idx;
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// expand the feature set if needed
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
sample_values.emplace_back();
}
sample_values.resize(inner_data.first + 1);
sample_idx.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > 1e-15) {
if (std::fabs(inner_data.second) > kEpsilon) {
// edit the feature value
sample_values[inner_data.first].push_back(inner_data.second);
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(static_cast<int>(i));
}
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
ret->CreateValid(
reinterpret_cast<const Dataset*>(reference));
}
......@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
std::vector<std::vector<int>> sample_idx(ncol_ptr - 1);
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
for (int j = 0; j < sample_cnt; j++) {
auto val = col_it.Get(sample_indices[j]);
if (std::fabs(val) > kEpsilon) {
sample_values[i].push_back(val);
sample_values[i].emplace_back(val);
sample_idx[i].emplace_back(j);
}
}
}
DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow));
ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
ret->CreateValid(
reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(guided)
for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num();
int feature_idx = ret->GetInnerFeatureIndex(i);
int feature_idx = ret->InnerFeatureIndex(i);
if (feature_idx < 0) { continue; }
int group = ret->Feature2Group(feature_idx);
int sub_feature = ret->Feture2SubFeature(feature_idx);
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
int row_idx = 0;
while (row_idx < nrow) {
......@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx = pair.first;
// no more data
if (row_idx < 0) { break; }
ret->FeatureAt(feature_idx)->PushData(tid, row_idx, pair.second);
ret->PushOneData(tid, row_idx, group, sub_feature, pair.second);
}
}
ret->FinishLoad();
......
......@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
bin_upper_bound_ = other.bin_upper_bound_;
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
BinMapper::BinMapper(const void* memory) {
......@@ -43,37 +37,48 @@ BinMapper::~BinMapper() {
}
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
bin_type_ = bin_type;
std::vector<double>& ref_values = (*values);
size_t sample_size = total_sample_cnt;
int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
return true;
}
void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data) {
// limit max_bin by min_data_in_bin
std::vector<double>& raw_values = values;
int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::sort(ref_values.begin(), ref_values.end());
std::sort(raw_values.begin(), raw_values.end());
// push zero in the front
if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0);
if (raw_values.empty() || (raw_values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
if (!ref_values.empty()) {
distinct_values.push_back(ref_values[0]);
if (!raw_values.empty()) {
distinct_values.push_back(raw_values[0]);
counts.push_back(1);
}
for (size_t i = 1; i < ref_values.size(); ++i) {
if (ref_values[i] != ref_values[i - 1]) {
if (ref_values[i - 1] == 0.0f) {
counts.back() += zero_cnt;
} else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
distinct_values.push_back(0);
for (size_t i = 1; i < raw_values.size(); ++i) {
if (raw_values[i] != raw_values[i - 1]) {
if (raw_values[i - 1] < 0.0f && raw_values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
distinct_values.push_back(ref_values[i]);
distinct_values.push_back(raw_values[i]);
counts.push_back(1);
} else {
++counts.back();
......@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
}
// push zero in the back
if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0);
if (!raw_values.empty() && raw_values.back() < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (num_values <= max_bin) {
std::sort(distinct_values.begin(), distinct_values.end());
// using the distinct values directly is enough
num_bin_ = num_values;
bin_upper_bound_ = std::vector<double>(num_values);
for (int i = 0; i < num_values - 1; ++i) {
bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
}
cnt_in_bin = counts;
bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
} else {
// mean size for one bin
double mean_bin_size = sample_size / static_cast<double>(max_bin);
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(sample_size);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
if (num_values <= max_bin) {
// using the distinct values is enough
bin_upper_bound_.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cnt_in_bin.push_back(cur_cnt_inbin);
cur_cnt_inbin = 0;
}
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
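// Worked example (hypothetical numbers): distinct_values = {1, 2, 3},
// counts = {5, 1, 4}, min_data_in_bin = 3. The boundary after value 1
// closes (5 >= 3); value 2 alone is under the threshold, so it merges
// into the final bin: bin_upper_bound_ = {1.5, inf}, cnt_in_bin = {5, 5},
// num_bin_ = 2.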
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
counts_int.push_back(counts[0]);
for (size_t i = 1; i < distinct_values.size(); ++i) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = static_cast<int>(raw_values.size());
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
}
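// Worked example (hypothetical numbers): total_sample_cnt = 1000 and
// min_data_in_bin = 3 cap max_bin at 1000 / 3 = 333; if only 60 samples
// are non-zero and zeros dominate (zero_cnt > mean_bin_size), the cap
// tightens further to 1 + 60 / 3 = 21 bins.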
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// will ignore categorical values with small counts
const int cut_cnt = static_cast<int>(sample_size * 0.95f);
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt && num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
cnt_in_bin = counts_int;
cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt;
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
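// Summary of the greedy pass above: any distinct value whose count already
// reaches the mean bin size gets a dedicated bin; the remaining values are
// packed left to right until a bin reaches the mean size of the leftover
// samples, which is re-estimated each time a non-big bin closes. The
// half-mean test on is_big_count_value[i + 1] closes a partially filled
// bin early instead of merging it into a high-frequency neighbor.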
// check for a trivial (num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trival_ = true;
default_bin_ = 0;
} else {
is_trival_ = false;
default_bin_ = ValueToBin(0);
}
if (NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data)) {
is_trival_ = true;
}
// calculate sparse rate
CHECK(num_bin_ <= max_bin);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(total_sample_cnt);
}
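For intuition on the distinct-value merge at the top of FindBin: zeros are never stored among the sampled values, so zero_cnt re-inserts them, by count, at their sorted position. A standalone sketch with a hypothetical 5-row sample, three rows non-zero:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> raw_values = {2.0, -1.0, 2.0};  // sampled non-zeros
  const size_t total_sample_cnt = 5;                  // 2 implicit zeros
  int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
  std::sort(raw_values.begin(), raw_values.end());
  std::vector<double> distinct_values;
  std::vector<int> counts;
  // zero goes first when every sampled value is positive (or none exist)
  if (raw_values.empty() || (raw_values[0] > 0.0 && zero_cnt > 0)) {
    distinct_values.push_back(0.0);
    counts.push_back(zero_cnt);
  }
  if (!raw_values.empty()) {
    distinct_values.push_back(raw_values[0]);
    counts.push_back(1);
  }
  for (size_t i = 1; i < raw_values.size(); ++i) {
    if (raw_values[i] != raw_values[i - 1]) {
      // zero slots in where the sign changes
      if (raw_values[i - 1] < 0.0 && raw_values[i] > 0.0) {
        distinct_values.push_back(0.0);
        counts.push_back(zero_cnt);
      }
      distinct_values.push_back(raw_values[i]);
      counts.push_back(1);
    } else {
      ++counts.back();
    }
  }
  // zero goes last when every sampled value is negative
  if (!raw_values.empty() && raw_values.back() < 0.0 && zero_cnt > 0) {
    distinct_values.push_back(0.0);
    counts.push_back(zero_cnt);
  }
  for (size_t i = 0; i < distinct_values.size(); ++i) {
    std::printf("%g:%d ", distinct_values[i], counts[i]);
  }
  std::printf("\n");  // prints: -1:1 0:2 2:2
  return 0;
}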
......@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += sizeof(int);
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t);
return size;
}
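// Worked size on a typical LP64 platform (int 4, bool 1, double 8,
// uint32_t 4): 4 + 1 + 8 + 2 * 8 + 4 = 33 fixed bytes plus 8 per bin,
// e.g. SizeForSpecificBin(255) = 33 + 2040 = 2073 bytes. Sizes are
// platform-dependent; only this function is authoritative.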
......@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
}
void BinMapper::CopyFrom(const char * buffer) {
......@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
}
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
}
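// Hedged round-trip sketch (assumes a default-constructible BinMapper and
// that SizesInByte() matches exactly what CopyTo writes, which the fields
// above suggest):
//   std::vector<char> buf(mapper.SizesInByte());
//   mapper.CopyTo(buf.data());
//   BinMapper restored;
//   restored.CopyFrom(buf.data());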
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
}
fwrite(&default_bin_, sizeof(default_bin_), 1, file);
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
}
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
}
+ sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
ret += sizeof(double) * num_bin_;
return ret;
}
......@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
double BinMapper::kSparseThreshold = 0.8f;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) {
bool is_enable_sparse, bool* is_sparse) {
// sparse threshold
const double kSparseThreshold = 0.8f;
if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) {
*is_sparse = true;
return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
return CreateDenseBin(num_data, num_bin);
}
}
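// Dispatch example (hypothetical numbers): a feature group whose default
// bin absorbs 94% of rows has sparse_rate = 0.94 >= kSparseThreshold (0.8),
// so with is_enable_sparse it is stored as a SparseBin; at sparse_rate 0.5
// it stays a DenseBin.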
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
if (num_bin <= 255) {
return new DenseBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new DenseBin<uint16_t>(num_data, default_bin);
} else {
return new DenseBin<uint32_t>(num_data, default_bin);
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new DenseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t>(num_data);
} else {
if (num_bin <= 255) {
return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
}
return new DenseBin<uint32_t>(num_data);
}
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
if (num_bin <= 255) {
return new SparseBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new SparseBin<uint16_t>(num_data, default_bin);
} else {
return new SparseBin<uint32_t>(num_data, default_bin);
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new SparseBin<uint16_t>(num_data);
} else {
if (num_bin <= 255) {
return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
}
return new SparseBin<uint32_t>(num_data);
}
}
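// Bin ids are 0-based, so num_bin == 256 still fits ids 0..255 in a
// uint8_t and num_bin == 65536 fits a uint16_t; that is presumably why
// the width cutoffs moved from 255/65535 to 256/65536 in this commit.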
......
......@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed.
if (GetInt(params, "seed", &seed)) {
Random rand(seed);
int int_max = std::numeric_limits<int>::max();
io_config.data_random_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.drop_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextInt(0, int_max));
int short_max = std::numeric_limits<short>::max();
io_config.data_random_seed = static_cast<int>(rand.NextShort(0, short_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextShort(0, short_max));
boosting_config.drop_seed = static_cast<int>(rand.NextShort(0, short_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextShort(0, short_max));
}
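// One user-visible seed deterministically derives every component seed, so
// runs reproduce exactly while the components stay decorrelated. Assuming
// Random::NextShort(lower, upper) samples uniformly from [lower, upper),
// each derived seed lands in [0, short_max).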
GetTaskType(params);
GetBoostingType(params);
......@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type = "gbdt";
} else if (value == std::string("dart")) {
boosting_type = "dart";
} else if (value == std::string("goss")) {
boosting_type = "goss";
} else {
Log::Fatal("Unknown boosting type %s", value.c_str());
}
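// `goss` selects Gradient-based One-Side Sampling, the boosting mode added
// in v2. A minimal config fragment (values hypothetical; the two rates are
// parsed in BoostingConfig::Set below):
//   boosting_type = goss
//   top_rate = 0.2
//   other_rate = 0.1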
......@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column);
GetString(params, "categorical_column", &categorical_column);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
GetInt(params, "min_dato_in_bin", &min_data_in_bin);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
GetBool(params, "enable_bundle", &enable_bundle);
GetBool(params, "adjacent_bundle", &adjacent_bundle);
}
......@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt(params, "max_drop", &max_drop);
GetBool(params, "xgboost_dart_mode", &xgboost_dart_mode);
GetBool(params, "uniform_drop", &uniform_drop);
GetDouble(params, "top_rate", &top_rate);
GetDouble(params, "other_rate", &other_rate);
CHECK(drop_rate <= 1.0 && drop_rate >= 0.0);
CHECK(skip_drop <= 1.0 && skip_drop >= 0.0);
GetTreeLearnerType(params);
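// Hedged suggestion, not part of this commit: top_rate and other_rate are
// GOSS sampling fractions, so range checks analogous to drop_rate's would be
//   CHECK(top_rate >= 0.0 && top_rate <= 1.0);
//   CHECK(other_rate >= 0.0 && other_rate <= 1.0);
//   CHECK(top_rate + other_rate <= 1.0);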
......