Commit 90127b52 authored by Nikita Titov, committed by Guolin Ke

cpplint whitespaces and new lines (#1986)

parent 6f548ada
......@@ -51,8 +51,7 @@ const int INVALID_SOCKET = -1;
#ifdef _WIN32
#ifndef _MSC_VER
// not using visual studio in windows
inline int inet_pton(int af, const char *src, void *dst)
{
inline int inet_pton(int af, const char *src, void *dst) {
struct sockaddr_storage ss;
int size = sizeof(ss);
char src_copy[INET6_ADDRSTRLEN + 1];
......@@ -119,11 +118,11 @@ public:
if (sockfd_ == INVALID_SOCKET) {
return;
}
if (setsockopt(sockfd_, SOL_SOCKET, SO_RCVBUF, reinterpret_cast<const char*>(&SocketConfig::kSocketBufferSize), sizeof(SocketConfig::kSocketBufferSize)) != 0) {
Log::Warning("Set SO_RCVBUF failed, please increase your net.core.rmem_max to 100k at least");
}
if (setsockopt(sockfd_, SOL_SOCKET, SO_SNDBUF, reinterpret_cast<const char*>(&SocketConfig::kSocketBufferSize), sizeof(SocketConfig::kSocketBufferSize)) != 0) {
Log::Warning("Set SO_SNDBUF failed, please increase your net.core.wmem_max to 100k at least");
}
......
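As an aside, the SO_RCVBUF/SO_SNDBUF calls in the hunk above follow the standard setsockopt pattern. Below is a minimal standalone sketch of that pattern; the helper name is hypothetical, POSIX headers are assumed, and LightGBM's SocketConfig::kSocketBufferSize is replaced by a plain parameter:

```cpp
#include <sys/socket.h>
#include <cstdio>

// Ask the kernel for larger receive/send buffers. The call can fail, or the
// OS may clamp the value (on Linux, to net.core.rmem_max / net.core.wmem_max),
// which is why the code above logs only a warning instead of aborting.
bool SetSocketBuffers(int sockfd, int buf_size) {
  if (setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF,
                 reinterpret_cast<const char*>(&buf_size), sizeof(buf_size)) != 0) {
    std::fprintf(stderr, "Set SO_RCVBUF failed\n");
    return false;
  }
  if (setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF,
                 reinterpret_cast<const char*>(&buf_size), sizeof(buf_size)) != 0) {
    std::fprintf(stderr, "Set SO_SNDBUF failed\n");
    return false;
  }
  return true;
}
```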
......@@ -19,7 +19,7 @@ public:
}
is_unbalance_ = config.is_unbalance;
scale_pos_weight_ = static_cast<double>(config.scale_pos_weight);
if(is_unbalance_ && std::fabs(scale_pos_weight_ - 1.0f) > 1e-6) {
if (is_unbalance_ && std::fabs(scale_pos_weight_ - 1.0f) > 1e-6) {
Log::Fatal("Cannot set is_unbalance and scale_pos_weight at the same time");
}
is_pos_ = is_pos;
......@@ -54,7 +54,7 @@ public:
// REMOVEME: remove the warning after 2.4 version release
Log::Warning("Starting from the 2.1.2 version, default value for "
"the \"boost_from_average\" parameter in \"binary\" objective is true.\n"
"This may cause significantly different results comparing to the previous versions of LightGBM.\n"
"This may cause significantly different results comparing to the previous versions of LightGBM.\n"
"Try to set boost_from_average=false, if your old models produce bad results");
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
......@@ -123,13 +123,13 @@ public:
}
}
}
// implement custom average to boost from (if enabled among options)
double BoostFromScore(int) const override {
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
#pragma omp parallel for schedule(static) reduction(+:suml, sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += is_pos_(label_[i]) * weights_[i];
sumw += weights_[i];
......@@ -149,7 +149,7 @@ public:
return initscore;
}
bool ClassNeedTrain(int /*class_id*/) const override {
return need_train_;
}
......
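The BoostFromScore hunk above accumulates two sums with a single OpenMP reduction clause. A self-contained sketch of the same pattern follows; the function and argument names are illustrative, not LightGBM's API:

```cpp
// Weighted mean of labels, parallelized with an OpenMP reduction:
// each thread gets private suml/sumw copies that are combined at the end,
// so no locking is needed on the shared totals.
double WeightedMean(const float* labels, const float* weights, int n) {
  double suml = 0.0;
  double sumw = 0.0;
#pragma omp parallel for schedule(static) reduction(+:suml, sumw)
  for (int i = 0; i < n; ++i) {
    suml += static_cast<double>(labels[i]) * weights[i];
    sumw += weights[i];
  }
  return sumw > 0 ? suml / sumw : 0.0;
}
```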
......@@ -35,7 +35,6 @@ public:
}
~MulticlassSoftmax() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
......@@ -138,8 +137,8 @@ public:
return std::log(std::max<double>(kEpsilon, class_init_probs_[class_id]));
}
bool ClassNeedTrain(int class_id) const override {
if (std::fabs(class_init_probs_[class_id]) <= kEpsilon
|| std::fabs(class_init_probs_[class_id]) >= 1.0 - kEpsilon) {
return false;
} else {
......@@ -197,7 +196,6 @@ public:
}
~MulticlassOVA() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
......
......@@ -9,7 +9,7 @@ namespace LightGBM {
ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) {
if (type == std::string("regression") || type == std::string("regression_l2")
|| type == std::string("mean_squared_error") || type == std::string("mse")
|| type == std::string("mean_squared_error") || type == std::string("mse")
|| type == std::string("l2_root") || type == std::string("root_mean_squared_error") || type == std::string("rmse")) {
return new RegressionL2loss(config);
} else if (type == std::string("regression_l1") || type == std::string("mean_absolute_error") || type == std::string("mae")) {
......
......@@ -34,11 +34,9 @@ public:
}
explicit LambdarankNDCG(const std::vector<std::string>&) {
}
~LambdarankNDCG() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
......
......@@ -78,7 +78,7 @@ public:
}
}
}
~RegressionL2loss() {
}
......@@ -146,7 +146,7 @@ public:
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
#pragma omp parallel for schedule(static) reduction(+:suml, sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
......@@ -221,7 +221,7 @@ public:
bool IsRenewTreeOutput() const override { return true; }
double RenewTreeOutput(double, const double* pred,
const data_size_t* index_mapper,
const data_size_t* bagging_mapper,
data_size_t num_data_in_leaf) const override {
......@@ -253,7 +253,7 @@ public:
}
}
double RenewTreeOutput(double, double pred,
const data_size_t* index_mapper,
const data_size_t* bagging_mapper,
data_size_t num_data_in_leaf) const override {
......@@ -362,7 +362,6 @@ public:
}
explicit RegressionFairLoss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
}
~RegressionFairLoss() {}
......@@ -414,7 +413,6 @@ public:
}
explicit RegressionPoissonLoss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
}
~RegressionPoissonLoss() {}
......@@ -492,7 +490,6 @@ public:
}
explicit RegressionQuantileloss(const std::vector<std::string>& strs): RegressionL2loss(strs) {
}
~RegressionQuantileloss() {}
......@@ -620,7 +617,6 @@ public:
}
explicit RegressionMAPELOSS(const std::vector<std::string>& strs) : RegressionL1loss(strs) {
}
~RegressionMAPELOSS() {}
......@@ -727,7 +723,6 @@ public:
private:
std::vector<label_t> label_weight_;
};
......@@ -741,7 +736,6 @@ public:
}
explicit RegressionGammaLoss(const std::vector<std::string>& strs) : RegressionPoissonLoss(strs) {
}
~RegressionGammaLoss() {}
......@@ -766,7 +760,6 @@ public:
const char* GetName() const override {
return "gamma";
}
};
/*!
......@@ -779,7 +772,6 @@ public:
}
explicit RegressionTweedieLoss(const std::vector<std::string>& strs) : RegressionPoissonLoss(strs) {
}
~RegressionTweedieLoss() {}
......@@ -790,7 +782,7 @@ public:
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(-label_[i] * std::exp((1 - rho_) * score[i]) + std::exp((2 - rho_) * score[i]));
hessians[i] = static_cast<score_t>(-label_[i] * (1 - rho_) * std::exp((1 - rho_) * score[i]) +
(2 - rho_) * std::exp((2 - rho_) * score[i]));
}
} else {
......@@ -806,6 +798,7 @@ public:
const char* GetName() const override {
return "tweedie";
}
private:
double rho_;
};
......
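For reference, the Tweedie gradients and hessians in the hunk above follow from differentiating the Tweedie deviance (constants dropped) under a log link, with score s = log(mu) and rho in (1, 2). A sketch of the standard derivation, which matches the gradients[i] and hessians[i] expressions in the code:

```latex
L(y, s) = -\,y\,\frac{e^{(1-\rho)s}}{1-\rho} + \frac{e^{(2-\rho)s}}{2-\rho}
\;\Longrightarrow\;
\frac{\partial L}{\partial s} = -\,y\,e^{(1-\rho)s} + e^{(2-\rho)s},
\quad
\frac{\partial^2 L}{\partial s^2} = -\,y\,(1-\rho)\,e^{(1-\rho)s} + (2-\rho)\,e^{(2-\rho)s}
```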
......@@ -65,7 +65,6 @@ public:
Log::Fatal("[%s]: sum of weights is zero", GetName());
}
}
}
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
......@@ -108,7 +107,7 @@ public:
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
#pragma omp parallel for schedule(static) reduction(+:suml, sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
......@@ -161,7 +160,6 @@ public:
Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check", GetName(), __func__);
if (weights_ != nullptr) {
Common::ObtainMinMaxSum(weights_, num_data_, &min_weight_, &max_weight_, (label_t*)nullptr);
if (min_weight_ <= 0.0f) {
Log::Fatal("[%s]: at least one weight is non-positive", GetName());
......@@ -196,7 +194,7 @@ public:
const double epf = std::exp(score[i]);
const double hhat = std::log(1.0f + epf);
const double z = 1.0f - std::exp(-w*hhat);
const double enf = 1.0f / epf; // = std::exp(-score[i]);
gradients[i] = static_cast<score_t>((1.0f - y / z) * w / (1.0f + enf));
const double c = 1.0f / (1.0f - z);
double d = 1.0f + epf;
......@@ -235,7 +233,7 @@ public:
double suml = 0.0f;
double sumw = 0.0f;
if (weights_ != nullptr) {
#pragma omp parallel for schedule(static) reduction(+:suml,sumw)
#pragma omp parallel for schedule(static) reduction(+:suml, sumw)
for (data_size_t i = 0; i < num_data_; ++i) {
suml += label_[i] * weights_[i];
sumw += weights_[i];
......
......@@ -14,7 +14,6 @@ DataParallelTreeLearner<TREELEARNER_T>::DataParallelTreeLearner(const Config* co
template <typename TREELEARNER_T>
DataParallelTreeLearner<TREELEARNER_T>::~DataParallelTreeLearner() {
}
template <typename TREELEARNER_T>
......
......@@ -48,7 +48,6 @@ public:
temp_right_indices_.resize(num_data_);
}
~DataPartition() {
}
/*!
......
......@@ -9,8 +9,7 @@
#include <cstring>
#include <cmath>
namespace LightGBM
{
namespace LightGBM {
class FeatureMetainfo {
public:
......@@ -83,7 +82,6 @@ public:
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) {
is_splittable_ = false;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
......@@ -118,7 +116,7 @@ public:
double best_sum_left_gradient = 0;
double best_sum_left_hessian = 0;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1 + is_full_categorical;
......@@ -336,7 +334,7 @@ public:
output->gain = kMinScore;
Log::Warning("'Forced Split' will be ignored since the gain getting worse. ");
return;
};
}
// update split information
output->threshold = threshold;
......@@ -452,7 +450,6 @@ public:
}
private:
static double GetSplitGains(double sum_left_gradients, double sum_left_hessians,
double sum_right_gradients, double sum_right_hessians,
double l1, double l2, double max_delta_step,
......@@ -502,7 +499,6 @@ private:
void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
const int8_t bias = meta_->bias;
double best_sum_left_gradient = NAN;
......@@ -512,7 +508,6 @@ private:
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
if (dir == -1) {
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
......@@ -522,7 +517,6 @@ private:
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
// need to skip default bin
if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
......@@ -581,7 +575,6 @@ private:
}
for (; t <= t_end; ++t) {
// need to skip default bin
if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
if (t >= 0) {
......@@ -645,7 +638,7 @@ private:
const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
//std::vector<HistogramBinEntry> data_;
// std::vector<HistogramBinEntry> data_;
bool is_splittable_ = true;
std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_;
......@@ -701,7 +694,7 @@ public:
if (feature_metas_.empty()) {
int num_feature = train_data->num_features();
feature_metas_.resize(num_feature);
#pragma omp parallel for schedule(static, 512) if(num_feature >= 1024)
#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024)
for (int i = 0; i < num_feature; ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
......@@ -751,7 +744,7 @@ public:
void ResetConfig(const Config* config) {
int size = static_cast<int>(feature_metas_.size());
#pragma omp parallel for schedule(static, 512) if(size >= 1024)
#pragma omp parallel for schedule(static, 512) if (size >= 1024)
for (int i = 0; i < size; ++i) {
feature_metas_[i].config = config;
}
......@@ -772,7 +765,7 @@ public:
last_used_time_[slot] = ++cur_time_;
return true;
} else {
// choose the least used slot
int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_));
*out = pool_[slot].get();
last_used_time_[slot] = ++cur_time_;
......@@ -810,6 +803,7 @@ public:
last_used_time_[slot] = ++cur_time_;
inverse_mapper_[slot] = dst_idx;
}
private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_;
......
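The histogram-pool hunks above implement a small least-recently-used cache: every access stamps a slot with an incrementing logical clock, and eviction picks the slot with the smallest stamp via ArgMin. A hedged, standalone sketch of just the eviction step (not the actual HistogramPool class):

```cpp
#include <algorithm>
#include <vector>

// Pick the least-recently-used slot: the one with the oldest (smallest)
// timestamp. Touching the slot bumps the shared logical clock.
int PickLruSlot(std::vector<int>* last_used_time, int* cur_time) {
  auto oldest = std::min_element(last_used_time->begin(), last_used_time->end());
  int slot = static_cast<int>(oldest - last_used_time->begin());
  (*last_used_time)[slot] = ++(*cur_time);  // mark the slot as just used
  return slot;
}
```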
......@@ -14,7 +14,6 @@ FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const Conf
template <typename TREELEARNER_T>
FeatureParallelTreeLearner<TREELEARNER_T>::~FeatureParallelTreeLearner() {
}
template <typename TREELEARNER_T>
......
......@@ -56,15 +56,14 @@ void PrintHistograms(HistogramBinEntry* h, size_t size) {
printf("\nTotal examples: %lu\n", total);
}
union Float_t
{
union Float_t {
int64_t i;
double f;
static int64_t ulp_diff(Float_t a, Float_t b) {
return abs(a.i - b.i);
}
};
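The Float_t union above supports comparing doubles by units in the last place (ULPs): reinterpreting the IEEE-754 bit pattern as a signed integer makes adjacent representable doubles differ by exactly 1. A small usage sketch, assuming IEEE-754 doubles and same-sign, non-NaN inputs (type punning through a union mirrors the code above, though it is technically implementation-defined in C++):

```cpp
#include <cstdint>
#include <cstdlib>

bool AlmostEqualUlps(double a, double b, int64_t max_ulps) {
  Float_t fa, fb;
  fa.f = a;
  fb.f = b;
  // distance in representable doubles between a and b
  return std::llabs(fa.i - fb.i) <= max_ulps;
}
```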
void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
size_t i;
......@@ -144,7 +143,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
printf("Setting exp_workgroups_per_feature to %d, using %u work groups\n", exp_workgroups_per_feature, num_workgroups);
printf("Constructing histogram with %d examples\n", leaf_num_data);
#endif
// the GPU kernel will process all features in one call, and each
// 2^exp_workgroups_per_feature (compile time constant) workgroup will
// process one feature4 tuple
......@@ -184,7 +183,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
// copy the results asynchronously. Size depends on if double precision is used
size_t output_size = num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
boost::compute::event histogram_wait_event;
host_histogram_outputs_ = (void*)queue_.enqueue_map_buffer_async(device_histogram_outputs_, boost::compute::command_queue::map_read,
0, output_size, histogram_wait_event, kernel_wait_obj_);
// we will wait for this object in WaitAndGetHistograms
histograms_wait_obj_ = boost::compute::wait_list(histogram_wait_event);
......@@ -196,13 +195,13 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
// when the output is ready, the computation is done
histograms_wait_obj_.wait();
#pragma omp parallel for schedule(static)
for(int i = 0; i < num_dense_feature_groups_; ++i) {
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (!feature_masks_[i]) {
continue;
}
int dense_group_index = dense_feature_group_map_[i];
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index);
int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
if (device_bin_mults_[i] == 1) {
for (int j = 0; j < bin_size; ++j) {
old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
......@@ -265,36 +264,36 @@ void GPUTreeLearner::AllocateGPUMemory() {
if (ptr_pinned_feature_masks_) {
queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
}
// make ordered_gradients and hessians larger (including extra room for prefetching), and pin them
ordered_gradients_.reserve(allocated_num_data_);
ordered_hessians_.reserve(allocated_num_data_);
pinned_gradients_ = boost::compute::buffer(); // deallocate
pinned_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
ordered_gradients_.data());
ptr_pinned_gradients_ = queue_.enqueue_map_buffer(pinned_gradients_, boost::compute::command_queue::map_write_invalidate_region,
0, allocated_num_data_ * sizeof(score_t));
pinned_hessians_ = boost::compute::buffer(); // deallocate
pinned_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
ordered_hessians_.data());
ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region,
0, allocated_num_data_ * sizeof(score_t));
// allocate space for gradients and hessians on device
// we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed
device_gradients_ = boost::compute::buffer(); // deallocate
device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_only, nullptr);
device_hessians_ = boost::compute::buffer(); // deallocate
device_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_only, nullptr);
// allocate feature mask, for disabling some feature-groups' histogram calculation
feature_masks_.resize(num_dense_feature4_ * dword_features_);
device_feature_masks_ = boost::compute::buffer(); // deallocate
device_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
boost::compute::memory_object::read_only, nullptr);
pinned_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
feature_masks_.data());
ptr_pinned_feature_masks_ = queue_.enqueue_map_buffer(pinned_feature_masks_, boost::compute::command_queue::map_write_invalidate_region,
0, num_dense_feature4_ * dword_features_);
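The repeated allocate-then-map sequences above all follow one Boost.Compute pattern: create a buffer with use_host_ptr over existing host memory, then map it so transfers go through page-locked (pinned) memory. A condensed sketch under those assumptions; the helper and its arguments are illustrative, not the learner's member code:

```cpp
#include <boost/compute/core.hpp>
#include <vector>

// Create a pinned (page-locked) view of existing host memory and map it.
// Returns the buffer; `*mapped` receives the host-visible pointer, which
// aliases host_data->data().
boost::compute::buffer PinHostBuffer(boost::compute::context& ctx,
                                     boost::compute::command_queue& queue,
                                     std::vector<float>* host_data,
                                     void** mapped) {
  boost::compute::buffer buf(
      ctx, host_data->size() * sizeof(float),
      boost::compute::memory_object::read_write |
          boost::compute::memory_object::use_host_ptr,
      host_data->data());
  // map_write_invalidate_region pins the region without copying its
  // current contents, since we intend to overwrite it anyway
  *mapped = queue.enqueue_map_buffer(
      buf, boost::compute::command_queue::map_write_invalidate_region,
      0, host_data->size() * sizeof(float));
  return buf;
}
```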
......@@ -320,7 +319,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
boost::compute::fill(sync_counters_->begin(), sync_counters_->end(), 0, queue_);
// The output buffer is allocated to host directly, to overlap compute and data transfer
device_histogram_outputs_ = boost::compute::buffer(); // deallocate
device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
// find the dense feature-groups and group them into the Feature4 data structure (several feature-groups packed into 4 bytes)
int k = 0, copied_feature4 = 0;
......@@ -342,7 +341,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
else {
sparse_feature_group_map_.push_back(i);
}
// found
if (k == dword_features_) {
k = 0;
for (int j = 0; j < dword_features_; ++j) {
......@@ -362,8 +361,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
// preallocate arrays for all threads, and pin them
for (int i = 0; i < nthreads; ++i) {
host4_vecs[i] = (Feature4*)boost::alignment::aligned_alloc(4096, num_data_ * sizeof(Feature4));
host4_bufs[i] = boost::compute::buffer(ctx_, num_data_ * sizeof(Feature4),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
host4_vecs[i]);
host4_ptrs[i] = (Feature4*)queue_.enqueue_map_buffer(host4_bufs[i], boost::compute::command_queue::map_write_invalidate_region,
0, num_data_ * sizeof(Feature4));
......@@ -402,13 +401,13 @@ void GPUTreeLearner::AllocateGPUMemory() {
*static_cast<Dense4bitsBinIterator*>(bin_iters[6]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[7])};
for (int j = 0; j < num_data_; ++j) {
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
|((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
|((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
|((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
}
}
......@@ -432,7 +431,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
}
else {
Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
}
}
}
......@@ -481,7 +480,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
}
......@@ -489,12 +488,12 @@ void GPUTreeLearner::AllocateGPUMemory() {
Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
}
else {
Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
}
}
else {
......@@ -538,8 +537,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
// data transfer time
std::chrono::duration<double, std::milli> end_time = std::chrono::steady_clock::now() - start_time;
Log::Info("%d dense feature groups (%.2f MB) transferred to GPU in %f secs. %d sparse feature groups",
dense_feature_group_map_.size(), ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0),
Log::Info("%d dense feature groups (%.2f MB) transferred to GPU in %f secs. %d sparse feature groups",
dense_feature_group_map_.size(), ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0),
end_time * 1e-3, sparse_feature_group_map_.size());
#if GPU_DEBUG >= 1
printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
......@@ -596,7 +595,7 @@ void GPUTreeLearner::BuildGPUKernels() {
OMP_LOOP_EX_BEGIN();
boost::compute::program program;
std::ostringstream opts;
// compile the GPU kernel depending if double precision is used, constant hessian is used, etc
// compile the GPU kernel depending if double precision is used, constant hessian is used, etc.
opts << " -D POWER_FEATURE_WORKGROUPS=" << i
<< " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(config_->gpu_use_dp)
<< " -D CONST_HESSIAN=" << int(is_constant_hessian_)
......@@ -617,7 +616,7 @@ void GPUTreeLearner::BuildGPUKernels() {
}
}
histogram_kernels_[i] = program.create_kernel(kernel_name_);
// kernel with all features enabled, with eliminated branches
opts << " -D ENABLE_ALL_FEATURES=1";
try {
......@@ -661,7 +660,7 @@ void GPUTreeLearner::SetupKernelArguments() {
for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) {
// The only argument that needs to be changed later is num_data_
if (is_constant_hessian_) {
// hessian is passed as a parameter, but it is not available now.
// hessian will be set in BeforeTrain()
histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_,
*device_data_indices_, num_data_, device_gradients_, 0.0f,
......@@ -711,9 +710,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
if ((int)platform_devices.size() > device_id) {
Log::Info("Using requested OpenCL platform %d device %d", platform_id, device_id);
dev_ = platform_devices[device_id];
}
}
}
// determine which kernel to use based on the max number of bins
if (max_num_bin_ <= 16) {
kernel_source_ = kernel16_src_;
......@@ -727,7 +726,7 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
device_bin_size_ = 64;
dword_features_ = 4;
}
else if ( max_num_bin_ <= 256) {
else if (max_num_bin_ <= 256) {
kernel_source_ = kernel256_src_;
kernel_name_ = "histogram256";
device_bin_size_ = 256;
......@@ -736,10 +735,10 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
else {
Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
}
if(max_num_bin_ == 65) {
if (max_num_bin_ == 65) {
Log::Warning("Setting max_bin to 63 is sugguested for best performance");
}
if(max_num_bin_ == 17) {
if (max_num_bin_ == 17) {
Log::Warning("Setting max_bin to 15 is sugguested for best performance");
}
ctx_ = boost::compute::context(dev_);
......@@ -774,7 +773,6 @@ void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) {
}
void GPUTreeLearner::BeforeTrain() {
#if GPU_DEBUG >= 2
printf("Copying intial full gradients and hessians to device\n");
#endif
......@@ -861,7 +859,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
// copy indices to the GPU:
#if GPU_DEBUG >= 2
Log::Info("Copying indices, gradients and hessians to GPU...");
printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin,num_data_in_left_child,num_data_in_right_child);
printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin, num_data_in_left_child, num_data_in_right_child);
#endif
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
......@@ -893,7 +891,6 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians) {
if (num_data <= 0) {
return false;
}
......@@ -901,7 +898,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
if (!num_dense_feature_groups_) {
return false;
}
// copy data indices if it is not null
if (data_indices != nullptr && num_data != num_data_) {
indices_future_ = boost::compute::copy_async(data_indices, data_indices + num_data, device_data_indices_->begin(), queue_);
......@@ -934,15 +931,15 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
}
// converted indices in is_feature_used to feature-group indices
std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
#pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048)
#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
for (int i = 0; i < num_features_; ++i) {
if(is_feature_used[i]) {
if (is_feature_used[i]) {
is_feature_group_used[train_data_->Feature2Group(i)] = 1;
}
}
// construct the feature masks for dense feature-groups
int used_dense_feature_groups = 0;
#pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
#pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (is_feature_group_used[dense_feature_group_map_[i]]) {
feature_masks_[i] = 1;
......@@ -1036,7 +1033,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
num_data,
num_data != num_data_ ? ordered_gradients_.data() : gradients_,
num_data != num_data_ ? ordered_hessians_.data() : hessians_,
current_histogram);
CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
delete [] gpu_histogram;
......@@ -1083,7 +1080,7 @@ void GPUTreeLearner::FindBestSplits() {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1;
printf("Feature %d smaller leaf:\n", feature_index);
PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size);
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
......@@ -1124,4 +1121,4 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right
}
} // namespace LightGBM
#endif  // USE_GPU
......@@ -63,12 +63,13 @@ protected:
void FindBestSplits() override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
private:
/*! \brief 4-byte feature tuple used by GPU kernels */
struct Feature4 {
uint8_t s[4];
};
/*! \brief Single precision histogram entry for GPU */
struct GPUHistogramBinEntry {
score_t sum_gradients;
......@@ -82,7 +83,7 @@ private:
* \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
*/
int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);
/*!
* \brief Initialize GPU device, context and command queues
* Also compiles the OpenCL kernel
......@@ -100,7 +101,7 @@ private:
* \brief Compile OpenCL GPU source code to kernel binaries
*/
void BuildGPUKernels();
/*!
* \brief Returns OpenCL kernel build log when compiled with option opts
* \param opts OpenCL build options
......@@ -120,7 +121,7 @@ private:
* \param use_all_features Set to true to not use feature masks, with a faster kernel
*/
void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);
/*!
* \brief Wait for GPU kernel execution and read histogram
* \param histograms Destination of histogram results from GPU.
......@@ -151,7 +152,7 @@ private:
/*! brief Log2 of max number of workgroups per feature*/
const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
/*! brief Max total number of workgroups with preallocated workspace.
* If we use more than this number of workgroups, we have to reallocate subhistograms */
int preallocd_max_num_wg_ = 1024;
......@@ -166,15 +167,15 @@ private:
/*! \brief GPU command queue object */
boost::compute::command_queue queue_;
/*! \brief GPU kernel for 256 bins */
const char *kernel256_src_ =
#include "ocl/histogram256.cl"
;
/*! \brief GPU kernel for 64 bins */
const char *kernel64_src_ =
#include "ocl/histogram64.cl"
;
/*! \brief GPU kernel for 16 bins */
const char *kernel16_src_ =
#include "ocl/histogram16.cl"
;
/*! \brief Currently used kernel source */
......@@ -266,7 +267,7 @@ private:
// When GPU support is not compiled in, quit with an error message
namespace LightGBM {
class GPUTreeLearner: public SerialTreeLearner {
public:
#pragma warning(disable : 4702)
......@@ -276,7 +277,7 @@ public:
}
};
}
} // namespace LightGBM
#endif // USE_GPU
......
......@@ -129,7 +129,7 @@ public:
/*! \brief Get sum of gradients of current leaf */
double sum_gradients() const { return sum_gradients_; }
/*! \brief Get sum of hessians of current leaf */
double sum_hessians() const { return sum_hessians_; }
......
......@@ -51,6 +51,7 @@ public:
~DataParallelTreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetConfig(const Config* config) override;
protected:
void BeforeTrain() override;
void FindBestSplits() override;
......@@ -104,6 +105,7 @@ public:
~VotingParallelTreeLearner() { }
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetConfig(const Config* config) override;
protected:
void BeforeTrain() override;
bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
......@@ -185,7 +187,7 @@ inline void SyncUpGlobalBestSplit(char* input_buffer_, char* output_buffer_, Spl
int size = SplitInfo::Size(max_cat_threshold);
smaller_best_split->CopyTo(input_buffer_);
larger_best_split->CopyTo(input_buffer_ + size);
Network::Allreduce(input_buffer_, size * 2, size, output_buffer_,
[] (const char* src, char* dst, int size, comm_size_t len) {
comm_size_t used_size = 0;
LightSplitInfo p1, p2;
......
......@@ -18,7 +18,7 @@ std::chrono::duration<double, std::milli> hist_time;
std::chrono::duration<double, std::milli> find_split_time;
std::chrono::duration<double, std::milli> split_time;
std::chrono::duration<double, std::milli> ordered_bin_time;
#endif  // TIMETAG
SerialTreeLearner::SerialTreeLearner(const Config* config)
:config_(config) {
......@@ -253,7 +253,6 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect
}
void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
......@@ -322,7 +321,7 @@ void SerialTreeLearner::BeforeTrain() {
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static, 512) if(end - begin >= 1024)
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
......@@ -335,7 +334,7 @@ void SerialTreeLearner::BeforeTrain() {
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
#pragma omp parallel for schedule(static, 512) if(end - begin >= 1024)
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
......@@ -401,7 +400,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static, 512) if(end - begin >= 1024)
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
......@@ -414,7 +413,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
#pragma omp parallel for schedule(static, 512) if(end - begin >= 1024)
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
......@@ -427,7 +426,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
void SerialTreeLearner::FindBestSplits() {
std::vector<int8_t> is_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048)
#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (parent_leaf_histogram_array_ != nullptr
......@@ -542,7 +541,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
}
int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
int* right_leaf, int *cur_depth,
bool *aborted_last_force_split) {
int32_t result_count = 0;
// start at root leaf
......@@ -553,8 +552,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int*
bool left_smaller = true;
std::unordered_map<int, SplitInfo> forceSplitMap;
q.push(std::make_pair(forced_split_json, *left_leaf));
while(!q.empty()) {
while (!q.empty()) {
// before processing next node from queue, store info for current left/right leaf
// store "best split" for left and right, even if they might be overwritten by forced split
if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
......@@ -815,7 +813,7 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj
for (int i = 0; i < tree->num_leaves(); ++i) {
tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]);
}
}
}
}
}
......
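A pattern recurring through the hunks above is the OpenMP if clause: parallelize only when the range is large enough to amortize thread startup. A minimal sketch, with the chunk size and threshold copied from the source; the flag-array helper itself is illustrative:

```cpp
#include <cstdint>

// Mark every index in [begin, end). The if() clause runs the loop serially
// for small ranges, where forking threads would cost more than the loop.
void MarkRange(int8_t* is_data_in_leaf, const int* indices,
               int begin, int end) {
#pragma omp parallel for schedule(static, 512) if (end - begin >= 1024)
  for (int i = begin; i < end; ++i) {
    is_data_in_leaf[indices[i]] = 1;
  }
}
```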
......@@ -103,10 +103,9 @@ protected:
/* Force splits with forced_split_json dict and then return num splits forced.*/
virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
int* right_leaf, int* cur_depth,
bool *aborted_last_force_split);
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
......
......@@ -185,7 +185,6 @@ public:
return local_feature == other_feature;
}
}
};
struct LightSplitInfo {
......@@ -280,7 +279,6 @@ public:
return local_feature == other_feature;
}
}
};
} // namespace LightGBM
......
......@@ -370,7 +370,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
template <typename TREELEARNER_T>
void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
// find best split from local aggregated histograms
......@@ -506,4 +505,4 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
// instantiate template classes, otherwise linker cannot find the code
template class VotingParallelTreeLearner<GPUTreeLearner>;
template class VotingParallelTreeLearner<SerialTreeLearner>;
} // namespace FTLBoost
} // namespace LightGBM