Unverified Commit bc7d2f0c authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

speed up for const hessian (#2857)

* speed up for const hessian

* rename template

* fix clang build

* template init

* add comment
parent f7037fd5
...@@ -30,6 +30,9 @@ enum MissingType { ...@@ -30,6 +30,9 @@ enum MissingType {
}; };
typedef double hist_t; typedef double hist_t;
// Integer counter type that can occupy a histogram slot in place of a hist_t
// value (histogram builders may view the hessian slot through a hist_cnt_t*).
typedef uint64_t hist_cnt_t;
// Compile-time check: hist_t and hist_cnt_t must have the same size so that a
// histogram slot can be reinterpreted as either type without changing layout.
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
const size_t kHistEntrySize = 2 * sizeof(hist_t); const size_t kHistEntrySize = 2 * sizeof(hist_t);
const int kHistOffset = 2; const int kHistOffset = 2;
...@@ -482,20 +485,24 @@ class MultiValBin { ...@@ -482,20 +485,24 @@ class MultiValBin {
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper, const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0; const std::vector<uint32_t>& delta) = 0;
virtual void ConstructHistogram( virtual void ConstructHistogram(const data_size_t* data_indices,
const data_size_t* data_indices, data_size_t start, data_size_t end, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0; hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end, virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0; hist_t* out) const = 0;
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end, virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
const score_t* ordered_gradients, hist_t* out) const = 0; data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void FinishLoad() = 0; virtual void FinishLoad() = 0;
......
...@@ -482,20 +482,56 @@ class Dataset { ...@@ -482,20 +482,56 @@ class Dataset {
void InitTrain(const std::vector<int8_t>& is_feature_used, void InitTrain(const std::vector<int8_t>& is_feature_used,
TrainingShareStates* share_state) const; TrainingShareStates* share_state) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used, template <bool USE_INDICES, bool USE_HESSIAN>
void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, data_size_t num_data, const score_t* gradients,
const score_t* hessians, score_t* ordered_gradients, const score_t* hessians,
score_t* ordered_gradients,
score_t* ordered_hessians, score_t* ordered_hessians,
TrainingShareStates* share_state, TrainingShareStates* share_state,
hist_t* histogram_data) const; hist_t* hist_data) const;
template <bool USE_INDICES, bool ORDERED>
void ConstructHistogramsMultiVal(const data_size_t* data_indices, void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
const score_t* hessians, const score_t* hessians,
TrainingShareStates* share_state, TrainingShareStates* share_state,
hist_t* histogram_data) const; hist_t* hist_data) const;
/*!
 * \brief Entry point for histogram construction; selects the matching
 *        ConstructHistogramsInner<USE_INDICES, USE_HESSIAN> instantiation.
 *
 * USE_INDICES is enabled only when data_indices is non-null and num_data is
 * smaller than num_data_. USE_HESSIAN is enabled unless
 * share_state->is_constant_hessian is set. All arguments are forwarded
 * unchanged to the selected instantiation.
 *
 * \param is_feature_used Forwarded to ConstructHistogramsInner
 * \param data_indices Row indices to use, or nullptr
 * \param num_data Number of rows to process; returns immediately when <= 0
 * \param gradients Forwarded to ConstructHistogramsInner
 * \param hessians Forwarded to ConstructHistogramsInner
 * \param ordered_gradients Forwarded to ConstructHistogramsInner
 * \param ordered_hessians Forwarded to ConstructHistogramsInner
 * \param share_state Supplies the is_constant_hessian flag; also forwarded
 * \param hist_data Output histogram buffer, forwarded to ConstructHistogramsInner
 */
inline void ConstructHistograms(
    const std::vector<int8_t>& is_feature_used,
    const data_size_t* data_indices, data_size_t num_data,
    const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const {
  // No rows requested: nothing to do.
  if (num_data <= 0) {
    return;
  }
  // Indices only matter when they select fewer rows than num_data_.
  const bool subset_rows = (data_indices != nullptr) && (num_data < num_data_);
  const bool per_row_hessian = !share_state->is_constant_hessian;
  // Turn the two runtime flags into compile-time template arguments.
  if (subset_rows && per_row_hessian) {
    ConstructHistogramsInner<true, true>(
        is_feature_used, data_indices, num_data, gradients, hessians,
        ordered_gradients, ordered_hessians, share_state, hist_data);
  } else if (subset_rows) {
    ConstructHistogramsInner<true, false>(
        is_feature_used, data_indices, num_data, gradients, hessians,
        ordered_gradients, ordered_hessians, share_state, hist_data);
  } else if (per_row_hessian) {
    ConstructHistogramsInner<false, true>(
        is_feature_used, data_indices, num_data, gradients, hessians,
        ordered_gradients, ordered_hessians, share_state, hist_data);
  } else {
    ConstructHistogramsInner<false, false>(
        is_feature_used, data_indices, num_data, gradients, hessians,
        ordered_gradients, ordered_hessians, share_state, hist_data);
  }
}
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const; void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
......
...@@ -651,8 +651,9 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -651,8 +651,9 @@ TrainingShareStates* Dataset::GetShareStates(
hist_data.data()); hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time; col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians, ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
rowwise_state.get(), hist_data.data()); hessians, gradients, hessians, rowwise_state.get(),
hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time; row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds", Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
col_wise_time * 1e-3, row_wise_time * 1e-3); col_wise_time * 1e-3, row_wise_time * 1e-3);
...@@ -1193,6 +1194,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1193,6 +1194,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
} }
} }
template <bool USE_INDICES, bool ORDERED>
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
...@@ -1237,21 +1239,17 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, ...@@ -1237,21 +1239,17 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1); static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
} }
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize); std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) { if (USE_INDICES) {
if (!share_state->is_constant_hessian) { if (ORDERED) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
hessians, data_ptr); gradients, hessians, data_ptr);
} else { } else {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
data_ptr); hessians, data_ptr);
} }
} else { } else {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians, multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr); data_ptr);
} else {
multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
}
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
...@@ -1263,20 +1261,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, ...@@ -1263,20 +1261,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
int bin_block_size = num_bin; int bin_block_size = num_bin;
Threading::BlockInfo<data_size_t>(share_state->num_threads, num_bin, 512, &n_bin_block, Threading::BlockInfo<data_size_t>(share_state->num_threads, num_bin, 512, &n_bin_block,
&bin_block_size); &bin_block_size);
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
}
} else {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) { for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size; const int start = t * bin_block_size;
...@@ -1288,10 +1272,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, ...@@ -1288,10 +1272,6 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
hist_data[i] += src_ptr[i]; hist_data[i] += src_ptr[i];
} }
} }
for (int i = start; i < end; ++i) {
GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
}
}
} }
global_timer.Stop("Dataset::sparse_bin_histogram_merge"); global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move"); global_timer.Start("Dataset::sparse_bin_histogram_move");
...@@ -1299,20 +1279,16 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, ...@@ -1299,20 +1279,16 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
global_timer.Stop("Dataset::sparse_bin_histogram_move"); global_timer.Stop("Dataset::sparse_bin_histogram_move");
} }
void Dataset::ConstructHistograms( template <bool USE_INDICES, bool USE_HESSIAN>
void Dataset::ConstructHistogramsInner(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians, data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians, score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const { TrainingShareStates* share_state, hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
if (!share_state->is_colwise) { if (!share_state->is_colwise) {
return ConstructHistogramsMultiVal(data_indices, num_data, gradients, return ConstructHistogramsMultiVal<USE_INDICES, false>(
hessians, share_state, hist_data); data_indices, num_data, gradients, hessians, share_state, hist_data);
} }
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group; std::vector<int> used_dense_group;
int multi_val_groud_id = -1; int multi_val_groud_id = -1;
used_dense_group.reserve(num_groups_); used_dense_group.reserve(num_groups_);
...@@ -1335,117 +1311,102 @@ void Dataset::ConstructHistograms( ...@@ -1335,117 +1311,102 @@ void Dataset::ConstructHistograms(
} }
} }
int num_used_dense_group = static_cast<int>(used_dense_group.size()); int num_used_dense_group = static_cast<int>(used_dense_group.size());
global_timer.Stop("Dataset::Get used group");
global_timer.Start("Dataset::dense_bin_histogram"); global_timer.Start("Dataset::dense_bin_histogram");
if (num_used_dense_group > 0) {
auto ptr_ordered_grad = gradients; auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians; auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) { if (num_used_dense_group > 0) {
if (!share_state->is_constant_hessian) { if (USE_INDICES) {
if (USE_HESSIAN) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]]; ordered_hessians[i] = hessians[data_indices[i]];
} }
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
} else { } else {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) #pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]]; ordered_gradients[i] = gradients[data_indices[i]];
} }
}
ptr_ordered_grad = ordered_gradients; ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians; }
if (!share_state->is_constant_hessian) { }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) { for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi]; int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_; const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize); num_bin * kHistEntrySize);
// construct histograms for smaller leaf if (USE_HESSIAN) {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr); data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else { } else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, data_ptr); 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} }
} else { } else {
if (!share_state->is_constant_hessian) { if (USE_INDICES) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else { } else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr); 0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
} }
OMP_LOOP_EX_END(); auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
for (int i = 0; i < num_bin * 2; i += 2) {
data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
} }
OMP_THROW_EX();
} }
OMP_LOOP_EX_END();
} }
OMP_THROW_EX();
} }
global_timer.Stop("Dataset::dense_bin_histogram"); global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) { if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal( if (num_used_dense_group > 0) {
ConstructHistogramsMultiVal<USE_INDICES, true>(
data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
} else {
ConstructHistogramsMultiVal<USE_INDICES, false>(
data_indices, num_data, gradients, hessians, share_state, data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
} }
}
} }
// Explicitly instantiate all four <USE_INDICES, USE_HESSIAN> combinations of
// ConstructHistogramsInner, for cross-module calls.
// <USE_INDICES = true, USE_HESSIAN = true>
template void Dataset::ConstructHistogramsInner<true, true>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
// <USE_INDICES = true, USE_HESSIAN = false>
template void Dataset::ConstructHistogramsInner<true, false>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
// <USE_INDICES = false, USE_HESSIAN = true>
template void Dataset::ConstructHistogramsInner<false, true>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
// <USE_INDICES = false, USE_HESSIAN = false>
template void Dataset::ConstructHistogramsInner<false, false>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
void Dataset::FixHistogram(int feature_idx, double sum_gradient, void Dataset::FixHistogram(int feature_idx, double sum_gradient,
double sum_hessian, hist_t* data) const { double sum_hessian, hist_t* data) const {
const int group = feature2group_[feature_idx]; const int group = feature2group_[feature_idx];
......
...@@ -68,42 +68,42 @@ class DenseBin: public Bin { ...@@ -68,42 +68,42 @@ class DenseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \ template<bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
hist_t* grad = out;
if (use_prefetch) { hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
if (USE_PREFETCH) {
const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset; const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) { for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + pf_idx); PREFETCH_T0(data_.data() + pf_idx);
const VAL_T bin = data_[idx]; const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
if (use_hessians) { if (USE_HESSIAN) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else { } else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f); grad[ti] += ordered_gradients[i];
++cnt[ti];
} }
} }
} }
for (; i < end; ++i) { for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const VAL_T bin = data_[idx]; const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
if (use_hessians) { if (USE_HESSIAN) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else { } else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f); grad[ti] += ordered_gradients[i];
++cnt[ti];
} }
} }
} }
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
......
...@@ -73,42 +73,44 @@ class Dense4bitsBin : public Bin { ...@@ -73,42 +73,44 @@ class Dense4bitsBin : public Bin {
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \ template<bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
const auto ti = (i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
hist_t* grad = out;
if (use_prefetch) { hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
if (USE_PREFETCH) {
const data_size_t pf_offset = 64; const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset; const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) { for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1)); PREFETCH_T0(data_.data() + (pf_idx >> 1));
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) { const uint8_t ti = static_cast<uint8_t>(bin) << 1;
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else { } else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f); grad[ti] += ordered_gradients[i];
++cnt[ti];
} }
} }
} }
for (; i < end; ++i) { for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) { const uint8_t ti = static_cast<uint8_t>(bin) << 1;
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else { } else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f); grad[ti] += ordered_gradients[i];
++cnt[ti];
} }
} }
} }
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
......
...@@ -50,75 +50,75 @@ class MultiValDenseBin : public MultiValBin { ...@@ -50,75 +50,75 @@ class MultiValDenseBin : public MultiValBin {
return false; return false;
} }
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians> template<bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const { const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
if (use_prefetch) { hist_t* grad = out;
hist_t* hess = out + 1;
if (USE_PREFETCH) {
const data_size_t pf_offset = 32 / sizeof(VAL_T); const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset; const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) { for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients + pf_idx); PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx); PREFETCH_T0(hessians + pf_idx);
} }
PREFETCH_T0(data_.data() + RowPtr(pf_idx)); PREFETCH_T0(data_.data() + RowPtr(pf_idx));
const auto j_start = RowPtr(idx); const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) { for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j]; const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (use_hessians) { if (ORDERED) {
ACC_GH(out, bin, gradients[idx], hessians[idx]); grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else { } else {
ACC_GH(out, bin, gradients[idx], 1.0f); grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
} }
} }
} }
} }
for (; i < end; ++i) { for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx); const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) { for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j]; const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (use_hessians) { if (ORDERED) {
ACC_GH(out, bin, gradients[idx], hessians[idx]); grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else { } else {
ACC_GH(out, bin, gradients[idx], 1.0f); grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
} }
} }
} }
} }
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
const score_t* gradients, const score_t* hessians, data_size_t end, const score_t* gradients,
hist_t* out) const override { const score_t* hessians, hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out); ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, hessians, out);
} }
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, const score_t* gradients, const score_t* hessians,
hist_t* out) const override { hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out); ConstructHistogramInner<false, false, false>(
nullptr, start, end, gradients, hessians, out);
} }
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogramOrdered(const data_size_t* data_indices,
const score_t* gradients, data_size_t start, data_size_t end,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* gradients,
const score_t* hessians,
hist_t* out) const override { hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out); ConstructHistogramInner<true, true, true>(data_indices, start, end,
gradients, hessians, out);
} }
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override { MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
......
...@@ -106,27 +106,24 @@ class MultiValSparseBin : public MultiValBin { ...@@ -106,27 +106,24 @@ class MultiValSparseBin : public MultiValBin {
bool IsSparse() override { return true; } bool IsSparse() override { return true; }
// Shared histogram accumulation loop of the multi-value sparse bin.
// This is a side-by-side diff: each line shows the removed text first and the
// added text second.
//
// Removed side: the ACC_GH macro plus a template on
// <use_indices, use_prefetch, use_hessians>; when use_hessians was false the
// constant 1.0f was accumulated in place of a hessian.
//
// Added side: template <USE_INDICES, USE_PREFETCH, ORDERED>
//   USE_INDICES  - the row id is data_indices[i]; otherwise it is i itself.
//   USE_PREFETCH - run the prefetching loop for i < end - pf_offset, then let
//                  the plain loop handle the remaining rows.
//   ORDERED      - gradients/hessians are read at position i instead of at
//                  the row id idx, and they are not prefetched.
// For every stored value data_[j] with j in [RowPtr(idx), RowPtr(idx + 1)),
// the gradient is added to out[2 * bin] and the hessian to out[2 * bin + 1].
#define ACC_GH(hist, i, g, h) \ template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h;
template <bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, void ConstructHistogramInner(const data_size_t* data_indices,
data_size_t start, data_size_t end, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* gradients,
const score_t* hessians, hist_t* out) const { const score_t* hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
// Added side: grad/hess view the interleaved output buffer at offsets 0 and 1.
if (use_prefetch) { hist_t* grad = out;
hist_t* hess = out + 1;
if (USE_PREFETCH) {
// Prefetch distance, in rows: 32 / sizeof(VAL_T).
const data_size_t pf_offset = 32 / sizeof(VAL_T); const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset; const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) { for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = const auto pf_idx =
use_indices ? data_indices[i + pf_offset] : i + pf_offset; USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients + pf_idx); PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx); PREFETCH_T0(hessians + pf_idx);
} }
PREFETCH_T0(row_ptr_.data() + pf_idx); PREFETCH_T0(row_ptr_.data() + pf_idx);
// Diff hunk boundary: any lines between the two hunks are not shown here.
...@@ -134,57 +131,55 @@ class MultiValSparseBin : public MultiValBin { ...@@ -134,57 +131,55 @@ class MultiValSparseBin : public MultiValBin {
const auto j_start = RowPtr(idx); const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1); const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) { for (auto j = j_start; j < j_end; ++j) {
// Added side: ti = 2 * bin is the gradient slot; the hessian slot follows it.
const VAL_T bin = data_[j]; const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (use_hessians) { if (ORDERED) {
ACC_GH(out, bin, gradients[idx], hessians[idx]); grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else { } else {
ACC_GH(out, bin, gradients[idx], 1.0f); grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
} }
} }
} }
} }
// Remaining rows (the whole range when prefetching is disabled): same
// accumulation without any prefetch.
for (; i < end; ++i) { for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i; const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx); const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1); const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) { for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j]; const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (use_hessians) { if (ORDERED) {
ACC_GH(out, bin, gradients[idx], hessians[idx]); grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else { } else {
ACC_GH(out, bin, gradients[idx], 1.0f); grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
} }
} }
} }
} }
#undef ACC_GH
// Indexed overload: forwards to ConstructHistogramInner with the index array
// and prefetching both enabled (first two template flags true).
// The third template flag changes from true on the removed side (named
// use_hessians there) to false on the added side (named ORDERED there), so
// gradients and hessians are read by row id.
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients, data_size_t end, const score_t* gradients,
const score_t* hessians, hist_t* out) const override { const score_t* hessians, hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, hessians, out); gradients, hessians, out);
} }
// Contiguous overload: forwards to ConstructHistogramInner with nullptr as the
// index array and with both the index and prefetch template flags false, so
// rows start..end-1 are visited directly.
// The third template flag changes from true on the removed side
// (use_hessians) to false on the added side (ORDERED).
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, const score_t* gradients, const score_t* hessians,
hist_t* out) const override { hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, ConstructHistogramInner<false, false, false>(
hessians, out); nullptr, start, end, gradients, hessians, out);
} }
// Removed side: two ConstructHistogram overloads that took no hessians and
// forwarded nullptr for them with the third template flag false (the path
// that accumulated 1.0f per value).
// Added side: a single ConstructHistogramOrdered taking both gradients and
// hessians; it forwards with all three template flags true (USE_INDICES,
// USE_PREFETCH, ORDERED), so row ids come from data_indices while
// gradients/hessians are read at the loop position i.
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, void ConstructHistogramOrdered(const data_size_t* data_indices,
data_size_t end, const score_t* gradients, data_size_t start, data_size_t end,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* gradients,
const score_t* hessians,
hist_t* out) const override { hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, ConstructHistogramInner<true, true, true>(data_indices, start, end,
nullptr, out); gradients, hessians, out);
} }
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int, MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
......
...@@ -138,6 +138,8 @@ class SparseBin: public Bin { ...@@ -138,6 +138,8 @@ class SparseBin: public Bin {
data_size_t i_delta, cur_pos; data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos); InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start; data_size_t i = start;
hist_t* grad = out;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
for (;;) { for (;;) {
if (cur_pos < data_indices[i]) { if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta]; cur_pos += deltas_[++i_delta];
...@@ -145,8 +147,9 @@ class SparseBin: public Bin { ...@@ -145,8 +147,9 @@ class SparseBin: public Bin {
} else if (cur_pos > data_indices[i]) { } else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; } if (++i >= end) { break; }
} else { } else {
const VAL_T bin = vals_[i_delta]; const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
ACC_GH(out, bin, ordered_gradients[i], 1.0f); grad[ti] += ordered_gradients[i];
++cnt[ti];
if (++i >= end) { break; } if (++i >= end) { break; }
cur_pos += deltas_[++i_delta]; cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; } if (i_delta >= num_vals_) { break; }
...@@ -158,12 +161,15 @@ class SparseBin: public Bin { ...@@ -158,12 +161,15 @@ class SparseBin: public Bin {
const score_t* ordered_gradients, hist_t* out) const override { const score_t* ordered_gradients, hist_t* out) const override {
data_size_t i_delta, cur_pos; data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos); InitIndex(start, &i_delta, &cur_pos);
hist_t* grad = out;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
while (cur_pos < start && i_delta < num_vals_) { while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta]; cur_pos += deltas_[++i_delta];
} }
while (cur_pos < end && i_delta < num_vals_) { while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta]; const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f); grad[ti] += ordered_gradients[cur_pos];
++cnt[ti];
cur_pos += deltas_[++i_delta]; cur_pos += deltas_[++i_delta];
} }
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment