Commit 9962e6d6 authored by Guolin Ke

support negative values for sparse features.

parent 1765b2e3
@@ -21,9 +21,9 @@ enum BinType {
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0;
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0;
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
@@ -352,7 +352,7 @@ public:
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse,
bool* is_sparse, int default_bin, BinType bin_type);
bool* is_sparse, uint32_t default_bin, BinType bin_type);
/*!
* \brief Create object for bin data of one feature, used for dense feature
@@ -363,7 +363,7 @@ public:
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin,
int default_bin, BinType bin_type);
uint32_t default_bin, BinType bin_type);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
@@ -374,7 +374,7 @@ public:
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin, int default_bin, BinType bin_type);
int num_bin, uint32_t default_bin, BinType bin_type);
};
inline unsigned int BinMapper::ValueToBin(double value) const {
......
@@ -85,7 +85,7 @@ enum TaskType {
/*! \brief Config for input and output files */
struct IOConfig: public ConfigBase {
public:
int max_bin = 256;
int max_bin = 255;
int num_class = 1;
int data_random_seed = 1;
std::string data_filename = "";
......
@@ -30,6 +30,8 @@ BinMapper::BinMapper(const BinMapper& other) {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
}
@@ -85,6 +87,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size());
int cnt_in_bin0 = 0;
if (bin_type_ == BinType::NumericalBin) {
@@ -96,7 +99,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
for (int i = 0; i < num_values - 1; ++i) {
bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
}
cnt_in_bin0 = counts[0];
cnt_in_bin = counts;
bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
} else {
// mean size for one bin
@@ -128,9 +131,7 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
if (bin_cnt == 0) {
cnt_in_bin0 = cur_cnt_inbin;
}
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
@@ -183,7 +184,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
Log::Warning("Too many categoricals are ignored, \
please use bigger max_bin or partition column \"%s\" ", column_name.c_str());
}
cnt_in_bin0 = static_cast<int>(sample_size) - used_cnt + counts_int[0];
cnt_in_bin = counts_int;
cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt;
}
// check for a trivial (num_bin_ == 1) feature
@@ -193,7 +195,8 @@ void BinMapper::FindBin(const std::string& column_name, std::vector<double>* val
is_trival_ = false;
}
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
CHECK(num_bin_ <= max_bin);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size);
}
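The sparse rate is now read from the count in the default bin rather than bin 0. For an all-positive feature the two coincide, but once negative values are binned, the bin containing 0.0 can sit anywhere in the range. A minimal sketch of that reading (GetDefaultBin's definition is outside this diff; treating it as "the bin that 0.0 maps to" is an inference):

```cpp
// Hypothetical helper, not verbatim from the source: the default bin is
// assumed to be wherever the unstored value 0.0 lands after binning.
unsigned int DefaultBinSketch(const BinMapper& mapper) {
  return mapper.ValueToBin(0.0);
}
```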
@@ -216,6 +219,11 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
@@ -232,6 +240,11 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
@@ -250,6 +263,8 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
@@ -259,7 +274,7 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_);
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
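The new min_val_ and max_val_ fields ride along every serialization path, so CopyTo, CopyFrom, SaveBinaryToFile, and SizesInByte must all change together. A hedged round-trip sketch of the invariant they share:

```cpp
#include <vector>

// Sketch only: a field added to CopyTo but missing from SizesInByte would
// write past the end of buf; one skipped in CopyFrom would shift every
// later field to the wrong offset.
void RoundTrip(BinMapper& src, BinMapper* dst) {
  std::vector<char> buf(src.SizesInByte());
  src.CopyTo(buf.data());
  dst->CopyFrom(buf.data());
}
```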
@@ -290,7 +305,7 @@ template class OrderedSparseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, bool* is_sparse, int default_bin, BinType bin_type) {
bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) {
// sparse threshold
const double kSparseThreshold = 0.8f;
if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
@@ -302,19 +317,19 @@ Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
if (num_bin <= 256) {
if (num_bin <= 255) {
return new DenseBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65536) {
} else if (num_bin <= 65535) {
return new DenseBin<uint16_t>(num_data, default_bin);
} else {
return new DenseBin<uint32_t>(num_data, default_bin);
}
} else {
if (num_bin <= 256) {
if (num_bin <= 255) {
return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65536) {
} else if (num_bin <= 65535) {
return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
@@ -322,19 +337,19 @@ Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, Bin
}
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
if (num_bin <= 256) {
if (num_bin <= 255) {
return new SparseBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65536) {
} else if (num_bin <= 65535) {
return new SparseBin<uint16_t>(num_data, default_bin);
} else {
return new SparseBin<uint32_t>(num_data, default_bin);
}
} else {
if (num_bin <= 256) {
if (num_bin <= 255) {
return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65536) {
} else if (num_bin <= 65535) {
return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
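The width cutoffs drop from 256/65536 to 255/65535, matching the new max_bin default of 255. The likely reason, consistent with the sparse-bin changes below: the maximum value of the storage type is reserved as a non-data sentinel in the delta stream, so each integer width loses one usable bin id. A sketch of the rule:

```cpp
#include <cstdint>
#include <limits>

// One value per width is the sentinel, leaving bins 0..max()-1 usable,
// so num_bin may be at most max() for a given storage type.
template <typename VAL_T>
constexpr int MaxUsableBins() {
  return static_cast<int>(std::numeric_limits<VAL_T>::max());
}

static_assert(MaxUsableBins<uint8_t>() == 255, "uint8_t holds bins 0..254");
static_assert(MaxUsableBins<uint16_t>() == 65535, "uint16_t holds bins 0..65534");
```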
......
@@ -16,14 +16,8 @@ namespace LightGBM {
template <typename VAL_T>
class DenseBin: public Bin {
public:
DenseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) {
data_.resize(num_data_);
VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
data_[i] = default_bin_T;
}
DenseBin(data_size_t num_data, uint32_t default_bin)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(default_bin)) {
}
~DenseBin() {
......
@@ -9,7 +9,7 @@
#include <cstring>
#include <cstdint>
#include <limits>
#include <vector>
namespace LightGBM {
@@ -50,12 +50,9 @@ public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
SparseBin(data_size_t num_data, int default_bin)
SparseBin(data_size_t num_data, uint32_t default_bin)
: num_data_(num_data) {
default_bin_ = static_cast<VAL_T>(default_bin);
if (default_bin_ != 0) {
Log::Info("Warning: sparse feature with negative values, treating negative values as zero");
}
#pragma omp parallel
#pragma omp master
{
@@ -75,9 +72,10 @@ public:
}
void Push(int tid, data_size_t idx, uint32_t value) override {
// do not store entries in the default bin
if (value <= default_bin_) { return; }
push_buffers_[tid].emplace_back(idx, static_cast<VAL_T>(value));
auto cur_bin = static_cast<VAL_T>(value);
if (cur_bin != default_bin_) {
push_buffers_[tid].emplace_back(idx, cur_bin);
}
}
BinIterator* GetIterator(data_size_t start_idx) const override;
@@ -90,10 +88,11 @@ public:
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
const VAL_T non_data_flag = std::numeric_limits<VAL_T>::max();
++(*i_delta);
*cur_pos += deltas_[*i_delta];
data_size_t factor = 1;
while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
while (*i_delta < num_vals_ && vals_[*i_delta] == non_data_flag) {
++(*i_delta);
factor *= kMaxDelta;
*cur_pos += deltas_[*i_delta] * factor;
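NextNonzero previously treated a stored value of 0 as the continuation marker for a multi-byte delta; since bin 0 can now be a legitimate non-default bin, the marker moves to the type's maximum value. A standalone sketch of the decode loop, assuming kMaxDelta (defined outside this hunk) is the radix of the uint8_t digits:

```cpp
#include <cstdint>
#include <limits>
#include <vector>

// Toy decoder for one entry of the delta stream: digits paired with the
// sentinel are continuation digits of a base-kMaxDelta gap; the first
// non-sentinel val is the real bin id at the decoded position.
int DecodeNext(const std::vector<uint8_t>& deltas,
               const std::vector<uint8_t>& vals,
               size_t* i, int* pos, const int kMaxDelta = 255) {
  const uint8_t flag = std::numeric_limits<uint8_t>::max();
  *pos += deltas[*i];
  int factor = 1;
  while (*i < vals.size() && vals[*i] == flag) {
    ++(*i);
    factor *= kMaxDelta;
    *pos += deltas[*i] * factor;
  }
  return vals[*i];
}
```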
@@ -130,41 +129,42 @@ public:
void FinishLoad() override {
// get total number of pushed pairs
size_t non_zero_size = 0;
size_t pair_cnt = 0;
for (size_t i = 0; i < push_buffers_.size(); ++i) {
non_zero_size += push_buffers_[i].size();
pair_cnt += push_buffers_[i].size();
}
std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair;
std::vector<std::pair<data_size_t, VAL_T>> idx_val_pairs;
// merge
non_zero_pair.reserve(non_zero_size);
idx_val_pairs.reserve(pair_cnt);
for (size_t i = 0; i < push_buffers_.size(); ++i) {
non_zero_pair.insert(non_zero_pair.end(), push_buffers_[i].begin(), push_buffers_[i].end());
idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
push_buffers_[i].clear();
push_buffers_[i].shrink_to_fit();
}
push_buffers_.clear();
push_buffers_.shrink_to_fit();
// sort by data index
std::sort(non_zero_pair.begin(), non_zero_pair.end(),
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first;
});
// load delta array
LoadFromPair(non_zero_pair);
LoadFromPair(idx_val_pairs);
}
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) {
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
deltas_.clear();
vals_.clear();
const VAL_T non_data_flag = std::numeric_limits<VAL_T>::max();
// transform to delta array
data_size_t last_idx = 0;
for (size_t i = 0; i < non_zero_pair.size(); ++i) {
const data_size_t cur_idx = non_zero_pair[i].first;
const VAL_T bin = non_zero_pair[i].second;
for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
const data_size_t cur_idx = idx_val_pairs[i].first;
const VAL_T bin = idx_val_pairs[i].second;
data_size_t cur_delta = cur_idx - last_idx;
while (cur_delta > kMaxDelta) {
deltas_.push_back(cur_delta % kMaxDelta);
vals_.push_back(0);
vals_.push_back(non_data_flag);
cur_delta /= kMaxDelta;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
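The rename from non_zero_pair to idx_val_pairs reflects the semantic shift: entries are (data index, bin) pairs, and the bin may legitimately be 0. On the encode side, an index gap wider than kMaxDelta is split into base-kMaxDelta digits, least significant first, each padding digit paired with the sentinel so the decoder in NextNonzero can recognize it. A hedged standalone sketch (255 is assumed for kMaxDelta):

```cpp
#include <cstdint>
#include <limits>
#include <vector>

// Toy encoder for one (gap, bin) step of the delta stream: every digit of
// the gap except the last is paired with the non-data sentinel; the final
// digit is paired with the real bin id, terminating the run.
void EncodeDelta(int gap, uint8_t bin,
                 std::vector<uint8_t>* deltas, std::vector<uint8_t>* vals,
                 const int kMaxDelta = 255) {
  const uint8_t flag = std::numeric_limits<uint8_t>::max();
  while (gap > kMaxDelta) {
    deltas->push_back(static_cast<uint8_t>(gap % kMaxDelta));  // low digit
    vals->push_back(flag);                                     // continuation
    gap /= kMaxDelta;
  }
  deltas->push_back(static_cast<uint8_t>(gap));
  vals->push_back(bin);
}
```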
@@ -270,7 +270,7 @@ public:
std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
for (data_size_t i = 0; i < num_used_indices; ++i) {
VAL_T bin = iterator.InnerGet(used_indices[i]);
if (bin > 0) {
if (bin != default_bin_) {
tmp_pair.emplace_back(i, bin);
}
}
@@ -297,7 +297,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) {
return bin_data_->vals_[i_delta_];
} else {
return 0;
return bin_data_->default_bin_;
}
}
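With Push dropping only the default bin, the read path has to synthesize it back: an index with no stored entry now reads as default_bin_ rather than 0, so a full scan reproduces the original column. A toy model of that contract (not the class's actual API):

```cpp
#include <cstdint>
#include <map>

// Absent indices resolve to the default bin; stored indices return their bin.
uint8_t InnerGetSketch(const std::map<int, uint8_t>& stored, int idx,
                       uint8_t default_bin) {
  auto it = stored.find(idx);
  return it != stored.end() ? it->second : default_bin;
}
```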
@@ -317,7 +317,7 @@ BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
template <typename VAL_T>
class SparseCategoricalBin: public SparseBin<VAL_T> {
public:
SparseCategoricalBin(data_size_t num_data, int default_bin)
SparseCategoricalBin(data_size_t num_data, uint32_t default_bin)
: SparseBin<VAL_T>(num_data, default_bin) {
}
......
@@ -58,12 +58,34 @@ public:
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
void FixIgnoreBin(double sum_gradient, double sum_hessian, data_size_t num_data) {
if (feature_->is_sparse()) {
// no need to fix if the default bin is 0
if (feature_->bin_type() == BinType::NumericalBin
&& feature_->bin_mapper()->GetDefaultBin() == 0) {
return;
}
int default_bin = static_cast<int>(feature_->bin_mapper()->GetDefaultBin());
data_[default_bin].sum_gradients = sum_gradient;
data_[default_bin].sum_hessians = sum_hessian;
data_[default_bin].cnt = num_data;
for (int t = feature_->num_bin() - 1; t >= 0; --t) {
if (t != default_bin) {
data_[default_bin].sum_gradients -= data_[t].sum_gradients;
data_[default_bin].sum_hessians -= data_[t].sum_hessians;
data_[default_bin].cnt -= data_[t].cnt;
}
}
}
}
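FixIgnoreBin exploits the fact that the sparse representation never pushes the default bin during histogram construction: its statistics can be recovered as the feature-level totals minus every explicitly accumulated bin. A standalone sketch of the identity:

```cpp
#include <vector>

struct Entry { double sum_gradients; double sum_hessians; int cnt; };

// The default bin's entry equals the totals minus all other bins, because
// rows falling in the default bin were never pushed into the histogram.
void FillDefaultBin(std::vector<Entry>* hist, size_t default_bin,
                    double total_grad, double total_hess, int total_cnt) {
  Entry& d = (*hist)[default_bin];
  d = {total_grad, total_hess, total_cnt};
  for (size_t t = 0; t < hist->size(); ++t) {
    if (t == default_bin) continue;
    d.sum_gradients -= (*hist)[t].sum_gradients;
    d.sum_hessians  -= (*hist)[t].sum_hessians;
    d.cnt           -= (*hist)[t].cnt;
  }
}
```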
/*!
* \brief Find best threshold for this histogram
* \param output The best split result
*/
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
FixIgnoreBin(sum_gradient, sum_hessian, num_data);
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
if (output->gain > kMinScore) {
is_splittable_ = true;
......