Unverified Commit bc7d2f0c authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

speed up for const hessian (#2857)

* speed up for const hessian

* rename template

* fix clang build

* template init

* add comment
parent f7037fd5
......@@ -30,6 +30,9 @@ enum MissingType {
};
typedef double hist_t;
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
const size_t kHistEntrySize = 2 * sizeof(hist_t);
const int kHistOffset = 2;
......@@ -482,20 +485,24 @@ class MultiValBin {
const std::vector<uint32_t>& lower, const std::vector<uint32_t>& upper,
const std::vector<uint32_t>& delta) = 0;
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void FinishLoad() = 0;
......
......@@ -482,20 +482,56 @@ class Dataset {
void InitTrain(const std::vector<int8_t>& is_feature_used,
TrainingShareStates* share_state) const;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients,
const score_t* hessians, score_t* ordered_gradients,
score_t* ordered_hessians,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
template <bool USE_INDICES, bool USE_HESSIAN>
void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients,
const score_t* hessians,
score_t* ordered_gradients,
score_t* ordered_hessians,
TrainingShareStates* share_state,
hist_t* hist_data) const;
template <bool USE_INDICES, bool ORDERED>
void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
TrainingShareStates* share_state,
hist_t* histogram_data) const;
hist_t* hist_data) const;
// Public entry point for histogram construction. Dispatches at runtime to one
// of four compile-time specializations of ConstructHistogramsInner, selecting
// the <USE_INDICES, USE_HESSIAN> template flags so the hot inner loops carry
// no per-iteration branching.
//   - USE_INDICES: true when a non-null subset of rows (data_indices) is used.
//   - USE_HESSIAN: false when the hessian is constant, enabling the faster
//     count-based accumulation path.
inline void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const {
// Nothing to accumulate for an empty (or invalid) row range.
if (num_data <= 0) {
return;
}
// Indices are only worth using when they select a strict subset of rows;
// a full-range index array would just add an indirection per row.
bool use_indices = data_indices != nullptr && (num_data < num_data_);
if (share_state->is_constant_hessian) {
// Constant hessian: USE_HESSIAN = false (counts are scaled afterwards).
if (use_indices) {
ConstructHistogramsInner<true, false>(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
} else {
ConstructHistogramsInner<false, false>(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
}
} else {
// Varying hessian: USE_HESSIAN = true, accumulate hessians per bin.
if (use_indices) {
ConstructHistogramsInner<true, true>(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
} else {
ConstructHistogramsInner<false, true>(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
}
}
}
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
......
......@@ -651,8 +651,9 @@ TrainingShareStates* Dataset::GetShareStates(
hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians,
rowwise_state.get(), hist_data.data());
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, rowwise_state.get(),
hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
col_wise_time * 1e-3, row_wise_time * 1e-3);
......@@ -1193,6 +1194,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
}
}
template <bool USE_INDICES, bool ORDERED>
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
......@@ -1237,21 +1239,17 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
hessians, data_ptr);
if (USE_INDICES) {
if (ORDERED) {
multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
gradients, hessians, data_ptr);
} else {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
data_ptr);
hessians, data_ptr);
}
} else {
if (!share_state->is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
} else {
multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
}
multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
}
OMP_LOOP_EX_END();
}
......@@ -1263,33 +1261,15 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
int bin_block_size = num_bin;
Threading::BlockInfo<data_size_t>(share_state->num_threads, num_bin, 512, &n_bin_block,
&bin_block_size);
if (!share_state->is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
}
} else {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
for (int i = start; i < end; ++i) {
GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = share_state->hist_buf.data() +
static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
}
......@@ -1299,20 +1279,16 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
global_timer.Stop("Dataset::sparse_bin_histogram_move");
}
void Dataset::ConstructHistograms(
template <bool USE_INDICES, bool USE_HESSIAN>
void Dataset::ConstructHistogramsInner(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
if (!share_state->is_colwise) {
return ConstructHistogramsMultiVal(data_indices, num_data, gradients,
hessians, share_state, hist_data);
return ConstructHistogramsMultiVal<USE_INDICES, false>(
data_indices, num_data, gradients, hessians, share_state, hist_data);
}
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
int multi_val_groud_id = -1;
used_dense_group.reserve(num_groups_);
......@@ -1335,117 +1311,102 @@ void Dataset::ConstructHistograms(
}
}
int num_used_dense_group = static_cast<int>(used_dense_group.size());
global_timer.Stop("Dataset::Get used group");
global_timer.Start("Dataset::dense_bin_histogram");
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (num_used_dense_group > 0) {
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!share_state->is_constant_hessian) {
if (USE_INDICES) {
if (USE_HESSIAN) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
} else {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
ptr_ordered_grad = ordered_gradients;
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
if (USE_HESSIAN) {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
} else {
if (!share_state->is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
} else {
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kHistEntrySize);
// construct histograms for smaller leaf
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
for (int i = 0; i < num_bin * 2; i += 2) {
data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
}
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(
data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
if (num_used_dense_group > 0) {
ConstructHistogramsMultiVal<USE_INDICES, true>(
data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
} else {
ConstructHistogramsMultiVal<USE_INDICES, false>(
data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
}
}
}
// explicitly instantiate template methods, for cross-module calls
template void Dataset::ConstructHistogramsInner<true, true>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
template void Dataset::ConstructHistogramsInner<true, false>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
template void Dataset::ConstructHistogramsInner<false, true>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
template void Dataset::ConstructHistogramsInner<false, false>(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
void Dataset::FixHistogram(int feature_idx, double sum_gradient,
double sum_hessian, hist_t* data) const {
const int group = feature2group_[feature_idx];
......
......@@ -68,42 +68,42 @@ class DenseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
template<bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
hist_t* grad = out;
hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
if (USE_PREFETCH) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + pf_idx);
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
......
......@@ -73,42 +73,44 @@ class Dense4bitsBin : public Bin {
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \
const auto ti = (i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
template<bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
hist_t* grad = out;
hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
if (USE_PREFETCH) {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1));
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
const uint8_t ti = static_cast<uint8_t>(bin) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
const auto idx = USE_INDICES ? data_indices[i] : i;
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
const uint8_t ti = static_cast<uint8_t>(bin) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
......
......@@ -50,75 +50,75 @@ class MultiValDenseBin : public MultiValBin {
return false;
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
template<bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
hist_t* grad = out;
hist_t* hess = out + 1;
if (USE_PREFETCH) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients + pf_idx);
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(data_.data() + RowPtr(pf_idx));
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (ORDERED) {
grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (ORDERED) {
grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* hessians, hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(
nullptr, start, end, gradients, hessians, out);
}
// Ordered variant: gradients/hessians are already gathered into the same
// order as data_indices, so the inner loop indexes them by position i rather
// than by row id. Forwards to ConstructHistogramInner with
// <USE_INDICES=true, USE_PREFETCH=true, ORDERED=true>.
void ConstructHistogramOrdered(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end,
gradients, hessians, out);
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
......
......@@ -106,27 +106,24 @@ class MultiValSparseBin : public MultiValBin {
bool IsSparse() override { return true; }
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h;
template <bool use_indices, bool use_prefetch, bool use_hessians>
template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
void ConstructHistogramInner(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
hist_t* grad = out;
hist_t* hess = out + 1;
if (USE_PREFETCH) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx =
use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients + pf_idx);
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(row_ptr_.data() + pf_idx);
......@@ -134,57 +131,55 @@ class MultiValSparseBin : public MultiValBin {
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (ORDERED) {
grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
const auto ti = static_cast<uint32_t>(data_[j]) << 1;
if (ORDERED) {
grad[ti] += gradients[i];
hess[ti] += hessians[i];
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
grad[ti] += gradients[idx];
hess[ti] += hessians[idx];
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* hessians, hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end,
gradients, hessians, out);
ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients,
hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end,
gradients, nullptr, out);
ConstructHistogramInner<false, false, false>(
nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients,
nullptr, out);
// Ordered variant for the sparse multi-value bin: gradients/hessians arrive
// pre-gathered in data_indices order, so they are read by loop position, not
// by row id. Forwards to ConstructHistogramInner with
// <USE_INDICES=true, USE_PREFETCH=true, ORDERED=true>.
void ConstructHistogramOrdered(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end,
gradients, hessians, out);
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
......
......@@ -138,6 +138,8 @@ class SparseBin: public Bin {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
hist_t* grad = out;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
......@@ -145,8 +147,9 @@ class SparseBin: public Bin {
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
grad[ti] += ordered_gradients[i];
++cnt[ti];
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
......@@ -158,12 +161,15 @@ class SparseBin: public Bin {
const score_t* ordered_gradients, hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
hist_t* grad = out;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
grad[ti] += ordered_gradients[cur_pos];
++cnt[ti];
cur_pos += deltas_[++i_delta];
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment