Unverified commit e79716e0, authored by Andrew Ziem and committed by GitHub
Browse files

Correct spelling (#4250)



* Correct spelling

Most changes were in comments, and there were a few changes to literals for log output.

There were no changes to variable names, function names, IDs, or functionality.

* Clarify a phrase in a comment
Co-authored-by: James Lamb <jaylamb20@gmail.com>

* Clarify a phrase in a comment
Co-authored-by: James Lamb <jaylamb20@gmail.com>

* Clarify a phrase in a comment
Co-authored-by: James Lamb <jaylamb20@gmail.com>

* Correct spelling

Most are code comments, but one case is a literal in a logging message.

There are a few grammar fixes too.
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent bb88d92e
...@@ -400,7 +400,7 @@ def cv(params, train_set, num_boost_round=100, ...@@ -400,7 +400,7 @@ def cv(params, train_set, num_boost_round=100,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None, eval_train_metric=False, callbacks=None, eval_train_metric=False,
return_cvbooster=False): return_cvbooster=False):
"""Perform the cross-validation with given paramaters. """Perform the cross-validation with given parameters.
Parameters Parameters
---------- ----------
...@@ -459,7 +459,7 @@ def cv(params, train_set, num_boost_round=100, ...@@ -459,7 +459,7 @@ def cv(params, train_set, num_boost_round=100,
train_data : Dataset train_data : Dataset
The training dataset. The training dataset.
eval_name : string eval_name : string
The name of evaluation function (without whitespaces). The name of evaluation function (without whitespace).
eval_result : float eval_result : float
The eval result. The eval result.
is_higher_better : bool is_higher_better : bool
......
...@@ -136,7 +136,7 @@ class _EvalFunctionWrapper: ...@@ -136,7 +136,7 @@ class _EvalFunctionWrapper:
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
eval_name : string eval_name : string
The name of evaluation function (without whitespaces). The name of evaluation function (without whitespace).
eval_result : float eval_result : float
The eval result. The eval result.
is_higher_better : bool is_higher_better : bool
...@@ -162,7 +162,7 @@ class _EvalFunctionWrapper: ...@@ -162,7 +162,7 @@ class _EvalFunctionWrapper:
Returns Returns
------- -------
eval_name : string eval_name : string
The name of evaluation function (without whitespaces). The name of evaluation function (without whitespace).
eval_result : float eval_result : float
The eval result. The eval result.
is_higher_better : bool is_higher_better : bool
...@@ -289,7 +289,7 @@ _lgbmmodel_doc_custom_eval_note = """ ...@@ -289,7 +289,7 @@ _lgbmmodel_doc_custom_eval_note = """
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
eval_name : string eval_name : string
The name of evaluation function (without whitespaces). The name of evaluation function (without whitespace).
eval_result : float eval_result : float
The eval result. The eval result.
is_higher_better : bool is_higher_better : bool
...@@ -402,7 +402,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -402,7 +402,7 @@ class LGBMModel(_LGBMModelBase):
subsample : float, optional (default=1.) subsample : float, optional (default=1.)
Subsample ratio of the training instance. Subsample ratio of the training instance.
subsample_freq : int, optional (default=0) subsample_freq : int, optional (default=0)
Frequence of subsample, <=0 means no enable. Frequency of subsample, <=0 means no enable.
colsample_bytree : float, optional (default=1.) colsample_bytree : float, optional (default=1.)
Subsample ratio of columns when constructing each tree. Subsample ratio of columns when constructing each tree.
reg_alpha : float, optional (default=0.) reg_alpha : float, optional (default=0.)
......
...@@ -106,7 +106,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective ...@@ -106,7 +106,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
num_data_ = train_data_->num_data(); num_data_ = train_data_->num_data();
// create buffer for gradients and hessians // create buffer for gradients and Hessians
if (objective_function_ != nullptr) { if (objective_function_ != nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_; size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
gradients_.resize(total_size); gradients_.resize(total_size);
...@@ -320,7 +320,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction) ...@@ -320,7 +320,7 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
} }
} }
/* If the custom "average" is implemented it will be used inplace of the label average (if enabled) /* If the custom "average" is implemented it will be used in place of the label average (if enabled)
* *
* An improvement to this is to have options to explicitly choose * An improvement to this is to have options to explicitly choose
* (i) standard average * (i) standard average
......
...@@ -131,7 +131,7 @@ class GBDT : public GBDTBase { ...@@ -131,7 +131,7 @@ class GBDT : public GBDTBase {
/*! /*!
* \brief Perform a full training procedure * \brief Perform a full training procedure
* \param snapshot_freq frequence of snapshot * \param snapshot_freq frequency of snapshot
* \param model_output_path path of model file * \param model_output_path path of model file
*/ */
void Train(int snapshot_freq, const std::string& model_output_path) override; void Train(int snapshot_freq, const std::string& model_output_path) override;
...@@ -141,7 +141,7 @@ class GBDT : public GBDTBase { ...@@ -141,7 +141,7 @@ class GBDT : public GBDTBase {
/*! /*!
* \brief Training logic * \brief Training logic
* \param gradients nullptr for using default objective, otherwise use self-defined boosting * \param gradients nullptr for using default objective, otherwise use self-defined boosting
* \param hessians nullptr for using default objective, otherwise use self-defined boosting * \param Hessians nullptr for using default objective, otherwise use self-defined boosting
* \return True if cannot train any more * \return True if cannot train any more
*/ */
bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; bool TrainOneIter(const score_t* gradients, const score_t* hessians) override;
...@@ -444,7 +444,7 @@ class GBDT : public GBDTBase { ...@@ -444,7 +444,7 @@ class GBDT : public GBDTBase {
/*! /*!
* \brief Print metric result of current iteration * \brief Print metric result of current iteration
* \param iter Current interation * \param iter Current iteration
* \return best_msg if met early_stopping * \return best_msg if met early_stopping
*/ */
std::string OutputMetric(int iter); std::string OutputMetric(int iter);
......
...@@ -347,7 +347,7 @@ void Config::CheckParamConflict() { ...@@ -347,7 +347,7 @@ void Config::CheckParamConflict() {
Log::Warning("CUDA currently requires double precision calculations."); Log::Warning("CUDA currently requires double precision calculations.");
gpu_use_dp = true; gpu_use_dp = true;
} }
// linear tree learner must be serial type and run on cpu device // linear tree learner must be serial type and run on CPU device
if (linear_tree) { if (linear_tree) {
if (device_type != std::string("cpu")) { if (device_type != std::string("cpu")) {
device_type = "cpu"; device_type = "cpu";
......
...@@ -1212,7 +1212,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat ...@@ -1212,7 +1212,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
dataset->metadata_.SetLabelAt(i, static_cast<label_t>(tmp_label)); dataset->metadata_.SetLabelAt(i, static_cast<label_t>(tmp_label));
// free processed line: // free processed line:
ref_text_data[i].clear(); ref_text_data[i].clear();
// shrink_to_fit will be very slow in linux, and seems not free memory, disable for now // shrink_to_fit will be very slow in Linux, and seems not free memory, disable for now
// text_reader_->Lines()[i].shrink_to_fit(); // text_reader_->Lines()[i].shrink_to_fit();
// push data // push data
std::vector<bool> is_feature_added(dataset->num_features_, false); std::vector<bool> is_feature_added(dataset->num_features_, false);
......
...@@ -198,11 +198,11 @@ class AUCMetric: public Metric { ...@@ -198,11 +198,11 @@ class AUCMetric: public Metric {
sorted_idx.emplace_back(i); sorted_idx.emplace_back(i);
} }
Common::ParallelSort(sorted_idx.begin(), sorted_idx.end(), [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); Common::ParallelSort(sorted_idx.begin(), sorted_idx.end(), [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
// temp sum of postive label // temp sum of positive label
double cur_pos = 0.0f; double cur_pos = 0.0f;
// total sum of postive label // total sum of positive label
double sum_pos = 0.0f; double sum_pos = 0.0f;
// accumlate of auc // accumulate of AUC
double accum = 0.0f; double accum = 0.0f;
// temp sum of negative label // temp sum of negative label
double cur_neg = 0.0f; double cur_neg = 0.0f;
...@@ -214,7 +214,7 @@ class AUCMetric: public Metric { ...@@ -214,7 +214,7 @@ class AUCMetric: public Metric {
// new threshold // new threshold
if (cur_score != threshold) { if (cur_score != threshold) {
threshold = cur_score; threshold = cur_score;
// accmulate // accumulate
accum += cur_neg*(cur_pos * 0.5f + sum_pos); accum += cur_neg*(cur_pos * 0.5f + sum_pos);
sum_pos += cur_pos; sum_pos += cur_pos;
// reset // reset
...@@ -231,7 +231,7 @@ class AUCMetric: public Metric { ...@@ -231,7 +231,7 @@ class AUCMetric: public Metric {
// new threshold // new threshold
if (cur_score != threshold) { if (cur_score != threshold) {
threshold = cur_score; threshold = cur_score;
// accmulate // accumulate
accum += cur_neg*(cur_pos * 0.5f + sum_pos); accum += cur_neg*(cur_pos * 0.5f + sum_pos);
sum_pos += cur_pos; sum_pos += cur_pos;
// reset // reset
...@@ -309,15 +309,15 @@ class AveragePrecisionMetric: public Metric { ...@@ -309,15 +309,15 @@ class AveragePrecisionMetric: public Metric {
sorted_idx.emplace_back(i); sorted_idx.emplace_back(i);
} }
Common::ParallelSort(sorted_idx.begin(), sorted_idx.end(), [score](data_size_t a, data_size_t b) {return score[a] > score[b]; }); Common::ParallelSort(sorted_idx.begin(), sorted_idx.end(), [score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
// temp sum of postive label // temp sum of positive label
double cur_actual_pos = 0.0f; double cur_actual_pos = 0.0f;
// total sum of postive label // total sum of positive label
double sum_actual_pos = 0.0f; double sum_actual_pos = 0.0f;
// total sum of predicted positive // total sum of predicted positive
double sum_pred_pos = 0.0f; double sum_pred_pos = 0.0f;
// accumulated precision // accumulated precision
double accum_prec = 1.0f; double accum_prec = 1.0f;
// accumlated pr-auc // accumulated pr-auc
double accum = 0.0f; double accum = 0.0f;
// temp sum of negative label // temp sum of negative label
double cur_neg = 0.0f; double cur_neg = 0.0f;
...@@ -348,7 +348,7 @@ class AveragePrecisionMetric: public Metric { ...@@ -348,7 +348,7 @@ class AveragePrecisionMetric: public Metric {
// new threshold // new threshold
if (cur_score != threshold) { if (cur_score != threshold) {
threshold = cur_score; threshold = cur_score;
// accmulate // accumulate
sum_actual_pos += cur_actual_pos; sum_actual_pos += cur_actual_pos;
sum_pred_pos += cur_actual_pos + cur_neg; sum_pred_pos += cur_actual_pos + cur_neg;
accum_prec = sum_actual_pos / sum_pred_pos; accum_prec = sum_actual_pos / sum_pred_pos;
......
...@@ -179,7 +179,7 @@ class MultiSoftmaxLoglossMetric: public MulticlassMetric<MultiSoftmaxLoglossMetr ...@@ -179,7 +179,7 @@ class MultiSoftmaxLoglossMetric: public MulticlassMetric<MultiSoftmaxLoglossMetr
} }
}; };
/*! \brief Auc-mu for multiclass task*/ /*! \brief AUC mu for multiclass task*/
class AucMuMetric : public Metric { class AucMuMetric : public Metric {
public: public:
explicit AucMuMetric(const Config& config) : config_(config) { explicit AucMuMetric(const Config& config) : config_(config) {
...@@ -275,7 +275,7 @@ class AucMuMetric : public Metric { ...@@ -275,7 +275,7 @@ class AucMuMetric : public Metric {
return false; return false;
} }
}); });
// calculate auc // calculate AUC
double num_j = 0; double num_j = 0;
double last_j_dist = 0; double last_j_dist = 0;
double num_current_j = 0; double num_current_j = 0;
......
...@@ -56,7 +56,7 @@ class NDCGMetric:public Metric { ...@@ -56,7 +56,7 @@ class NDCGMetric:public Metric {
} }
} }
inverse_max_dcgs_.resize(num_queries_); inverse_max_dcgs_.resize(num_queries_);
// cache the inverse max DCG for all querys, used to calculate NDCG // cache the inverse max DCG for all queries, used to calculate NDCG
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f); inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f);
...@@ -67,7 +67,7 @@ class NDCGMetric:public Metric { ...@@ -67,7 +67,7 @@ class NDCGMetric:public Metric {
if (inverse_max_dcgs_[i][j] > 0.0f) { if (inverse_max_dcgs_[i][j] > 0.0f) {
inverse_max_dcgs_[i][j] = 1.0f / inverse_max_dcgs_[i][j]; inverse_max_dcgs_[i][j] = 1.0f / inverse_max_dcgs_[i][j];
} else { } else {
// marking negative for all negative querys. // marking negative for all negative queries.
// if one meet this query, it's ndcg will be set as -1. // if one meet this query, it's ndcg will be set as -1.
inverse_max_dcgs_[i][j] = -1.0f; inverse_max_dcgs_[i][j] = -1.0f;
} }
......
...@@ -239,7 +239,7 @@ class PoissonMetric: public RegressionMetric<PoissonMetric> { ...@@ -239,7 +239,7 @@ class PoissonMetric: public RegressionMetric<PoissonMetric> {
}; };
/*! \brief Mape regression loss for regression task */ /*! \brief MAPE regression loss for regression task */
class MAPEMetric : public RegressionMetric<MAPEMetric> { class MAPEMetric : public RegressionMetric<MAPEMetric> {
public: public:
explicit MAPEMetric(const Config& config) :RegressionMetric<MAPEMetric>(config) { explicit MAPEMetric(const Config& config) :RegressionMetric<MAPEMetric>(config) {
......
...@@ -55,7 +55,7 @@ RecursiveHalvingMap::RecursiveHalvingMap(int in_k, RecursiveHalvingNodeType _typ ...@@ -55,7 +55,7 @@ RecursiveHalvingMap::RecursiveHalvingMap(int in_k, RecursiveHalvingNodeType _typ
is_power_of_2 = _is_power_of_2; is_power_of_2 = _is_power_of_2;
if (type != RecursiveHalvingNodeType::Other) { if (type != RecursiveHalvingNodeType::Other) {
for (int i = 0; i < k; ++i) { for (int i = 0; i < k; ++i) {
// defalut set as -1 // default set as -1
ranks.push_back(-1); ranks.push_back(-1);
send_block_start.push_back(-1); send_block_start.push_back(-1);
send_block_len.push_back(-1); send_block_len.push_back(-1);
...@@ -153,7 +153,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) { ...@@ -153,7 +153,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) {
const int dir = ((cur_group_idx / distance[i]) % 2 == 0) ? 1 : -1; const int dir = ((cur_group_idx / distance[i]) % 2 == 0) ? 1 : -1;
const int next_node_idx = group_to_node[(cur_group_idx + dir * distance[i])]; const int next_node_idx = group_to_node[(cur_group_idx + dir * distance[i])];
rec_map.ranks[i] = next_node_idx; rec_map.ranks[i] = next_node_idx;
// get receive block informations // get receive block information
const int recv_block_start = cur_group_idx / distance[i]; const int recv_block_start = cur_group_idx / distance[i];
rec_map.recv_block_start[i] = group_block_start[recv_block_start * distance[i]]; rec_map.recv_block_start[i] = group_block_start[recv_block_start * distance[i]];
int recv_block_len = 0; int recv_block_len = 0;
...@@ -162,7 +162,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) { ...@@ -162,7 +162,7 @@ RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) {
recv_block_len += group_block_len[recv_block_start * distance[i] + j]; recv_block_len += group_block_len[recv_block_start * distance[i] + j];
} }
rec_map.recv_block_len[i] = recv_block_len; rec_map.recv_block_len[i] = recv_block_len;
// get send block informations // get send block information
const int send_block_start = (cur_group_idx + dir * distance[i]) / distance[i]; const int send_block_start = (cur_group_idx + dir * distance[i]) / distance[i];
rec_map.send_block_start[i] = group_block_start[send_block_start * distance[i]]; rec_map.send_block_start[i] = group_block_start[send_block_start * distance[i]];
int send_block_len = 0; int send_block_len = 0;
......
...@@ -132,7 +132,7 @@ class Linkers { ...@@ -132,7 +132,7 @@ class Linkers {
*/ */
bool CheckLinker(int rank); bool CheckLinker(int rank);
/*! /*!
* \brief Print connented linkers * \brief Print connected linkers
*/ */
void PrintLinkers(); void PrintLinkers();
......
...@@ -88,12 +88,12 @@ class RankingObjective : public ObjectiveFunction { ...@@ -88,12 +88,12 @@ class RankingObjective : public ObjectiveFunction {
const label_t* label_; const label_t* label_;
/*! \brief Pointer of weights */ /*! \brief Pointer of weights */
const label_t* weights_; const label_t* weights_;
/*! \brief Query boundries */ /*! \brief Query boundaries */
const data_size_t* query_boundaries_; const data_size_t* query_boundaries_;
}; };
/*! /*!
* \brief Objective function for Lambdrank with NDCG * \brief Objective function for LambdaRank with NDCG
*/ */
class LambdarankNDCG : public RankingObjective { class LambdarankNDCG : public RankingObjective {
public: public:
...@@ -133,7 +133,7 @@ class LambdarankNDCG : public RankingObjective { ...@@ -133,7 +133,7 @@ class LambdarankNDCG : public RankingObjective {
inverse_max_dcgs_[i] = 1.0f / inverse_max_dcgs_[i]; inverse_max_dcgs_[i] = 1.0f / inverse_max_dcgs_[i];
} }
} }
// construct sigmoid table to speed up sigmoid transform // construct Sigmoid table to speed up Sigmoid transform
ConstructSigmoidTable(); ConstructSigmoidTable();
} }
...@@ -256,7 +256,7 @@ class LambdarankNDCG : public RankingObjective { ...@@ -256,7 +256,7 @@ class LambdarankNDCG : public RankingObjective {
const char* GetName() const override { return "lambdarank"; } const char* GetName() const override { return "lambdarank"; }
private: private:
/*! \brief Simgoid param */ /*! \brief Sigmoid param */
double sigmoid_; double sigmoid_;
/*! \brief Normalize the lambdas or not */ /*! \brief Normalize the lambdas or not */
bool norm_; bool norm_;
...@@ -272,9 +272,9 @@ class LambdarankNDCG : public RankingObjective { ...@@ -272,9 +272,9 @@ class LambdarankNDCG : public RankingObjective {
size_t _sigmoid_bins = 1024 * 1024; size_t _sigmoid_bins = 1024 * 1024;
/*! \brief Minimal input of sigmoid table */ /*! \brief Minimal input of sigmoid table */
double min_sigmoid_input_ = -50; double min_sigmoid_input_ = -50;
/*! \brief Maximal input of sigmoid table */ /*! \brief Maximal input of Sigmoid table */
double max_sigmoid_input_ = 50; double max_sigmoid_input_ = 50;
/*! \brief Factor that covert score to bin in sigmoid table */ /*! \brief Factor that covert score to bin in Sigmoid table */
double sigmoid_table_idx_factor_; double sigmoid_table_idx_factor_;
}; };
......
...@@ -571,7 +571,7 @@ class RegressionQuantileloss : public RegressionL2loss { ...@@ -571,7 +571,7 @@ class RegressionQuantileloss : public RegressionL2loss {
/*! /*!
* \brief Mape Regression Loss * \brief MAPE Regression Loss
*/ */
class RegressionMAPELOSS : public RegressionL1loss { class RegressionMAPELOSS : public RegressionL1loss {
public: public:
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <vector> #include <vector>
/* /*
* Implements gradients and hessians for the following point losses. * Implements gradients and Hessians for the following point losses.
* Target y is anything in interval [0, 1]. * Target y is anything in interval [0, 1].
* *
* (1) CrossEntropy; "xentropy"; * (1) CrossEntropy; "xentropy";
...@@ -76,7 +76,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -76,7 +76,7 @@ class CrossEntropy: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
// compute pointwise gradients and hessians with implied unit weights // compute pointwise gradients and Hessians with implied unit weights
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
...@@ -84,7 +84,7 @@ class CrossEntropy: public ObjectiveFunction { ...@@ -84,7 +84,7 @@ class CrossEntropy: public ObjectiveFunction {
hessians[i] = static_cast<score_t>(z * (1.0f - z)); hessians[i] = static_cast<score_t>(z * (1.0f - z));
} }
} else { } else {
// compute pointwise gradients and hessians with given weights // compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
...@@ -189,7 +189,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -189,7 +189,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
if (weights_ == nullptr) { if (weights_ == nullptr) {
// compute pointwise gradients and hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights // compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double z = 1.0f / (1.0f + std::exp(-score[i])); const double z = 1.0f / (1.0f + std::exp(-score[i]));
...@@ -197,7 +197,7 @@ class CrossEntropyLambda: public ObjectiveFunction { ...@@ -197,7 +197,7 @@ class CrossEntropyLambda: public ObjectiveFunction {
hessians[i] = static_cast<score_t>(z * (1.0f - z)); hessians[i] = static_cast<score_t>(z * (1.0f - z));
} }
} else { } else {
// compute pointwise gradients and hessians with given weights // compute pointwise gradients and Hessians with given weights
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
const double w = weights_[i]; const double w = weights_[i];
......
...@@ -99,7 +99,7 @@ class CUDATreeLearner: public SerialTreeLearner { ...@@ -99,7 +99,7 @@ class CUDATreeLearner: public SerialTreeLearner {
/*! /*!
* \brief Compute GPU feature histogram for the current leaf. * \brief Compute GPU feature histogram for the current leaf.
* Indices, gradients and hessians have been copied to the device. * Indices, gradients and Hessians have been copied to the device.
* \param leaf_num_data Number of data on current leaf * \param leaf_num_data Number of data on current leaf
* \param use_all_features Set to true to not use feature masks, with a faster kernel * \param use_all_features Set to true to not use feature masks, with a faster kernel
*/ */
...@@ -224,7 +224,7 @@ class CUDATreeLearner: public SerialTreeLearner { ...@@ -224,7 +224,7 @@ class CUDATreeLearner: public SerialTreeLearner {
std::vector<cudaEvent_t> indices_future_; std::vector<cudaEvent_t> indices_future_;
/*! Asynchronous waiting object for copying gradients */ /*! Asynchronous waiting object for copying gradients */
std::vector<cudaEvent_t> gradients_future_; std::vector<cudaEvent_t> gradients_future_;
/*! Asynchronous waiting object for copying hessians */ /*! Asynchronous waiting object for copying Hessians */
std::vector<cudaEvent_t> hessians_future_; std::vector<cudaEvent_t> hessians_future_;
/*! Asynchronous waiting object for copying dense features */ /*! Asynchronous waiting object for copying dense features */
std::vector<cudaEvent_t> features_future_; std::vector<cudaEvent_t> features_future_;
......
...@@ -359,7 +359,7 @@ class FeatureHistogram { ...@@ -359,7 +359,7 @@ class FeatureHistogram {
continue; continue;
} }
// mark to is splittable // mark as able to be split
is_splittable_ = true; is_splittable_ = true;
// better split point // better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
...@@ -940,7 +940,7 @@ class FeatureHistogram { ...@@ -940,7 +940,7 @@ class FeatureHistogram {
continue; continue;
} }
// mark to is splittable // mark as able to be split
is_splittable_ = true; is_splittable_ = true;
// better split point // better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
...@@ -1010,7 +1010,7 @@ class FeatureHistogram { ...@@ -1010,7 +1010,7 @@ class FeatureHistogram {
} }
double sum_right_hessian = sum_hessian - sum_left_hessian; double sum_right_hessian = sum_hessian - sum_left_hessian;
// if sum hessian too small // if sum Hessian too small
if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
break; break;
} }
...@@ -1033,7 +1033,7 @@ class FeatureHistogram { ...@@ -1033,7 +1033,7 @@ class FeatureHistogram {
continue; continue;
} }
// mark to is splittable // mark as able to be split
is_splittable_ = true; is_splittable_ = true;
// better split point // better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
......
...@@ -119,7 +119,7 @@ int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { ...@@ -119,7 +119,7 @@ int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
} }
void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) {
// we have already copied ordered gradients, ordered hessians and indices to GPU // we have already copied ordered gradients, ordered Hessians and indices to GPU
// decide the best number of workgroups working on one feature4 tuple // decide the best number of workgroups working on one feature4 tuple
// set work group size based on feature size // set work group size based on feature size
// each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple
...@@ -164,7 +164,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur ...@@ -164,7 +164,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
// there will be 2^exp_workgroups_per_feature = num_workgroups / num_dense_feature4 sub-histogram per feature4 // there will be 2^exp_workgroups_per_feature = num_workgroups / num_dense_feature4 sub-histogram per feature4
// and we will launch num_feature workgroups for this kernel // and we will launch num_feature workgroups for this kernel
// will launch threads for all features // will launch threads for all features
// the queue should be asynchrounous, and we will can WaitAndGetHistograms() before we start processing dense feature groups // the queue should be asynchronous, and we will can WaitAndGetHistograms() before we start processing dense feature groups
if (leaf_num_data == num_data_) { if (leaf_num_data == num_data_) {
kernel_wait_obj_ = boost::compute::wait_list( kernel_wait_obj_ = boost::compute::wait_list(
queue_.enqueue_1d_range_kernel(histogram_fulldata_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256)); queue_.enqueue_1d_range_kernel(histogram_fulldata_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256));
...@@ -256,7 +256,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -256,7 +256,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
if (ptr_pinned_feature_masks_) { if (ptr_pinned_feature_masks_) {
queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_); queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
} }
// make ordered_gradients and hessians larger (including extra room for prefetching), and pin them // make ordered_gradients and Hessians larger (including extra room for prefetching), and pin them
ordered_gradients_.reserve(allocated_num_data_); ordered_gradients_.reserve(allocated_num_data_);
ordered_hessians_.reserve(allocated_num_data_); ordered_hessians_.reserve(allocated_num_data_);
pinned_gradients_ = boost::compute::buffer(); // deallocate pinned_gradients_ = boost::compute::buffer(); // deallocate
...@@ -271,8 +271,8 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -271,8 +271,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
ordered_hessians_.data()); ordered_hessians_.data());
ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region, ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region,
0, allocated_num_data_ * sizeof(score_t)); 0, allocated_num_data_ * sizeof(score_t));
// allocate space for gradients and hessians on device // allocate space for gradients and Hessians on device
// we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed // we will copy gradients and Hessians in after ordered_gradients_ and ordered_hessians_ are constructed
device_gradients_ = boost::compute::buffer(); // deallocate device_gradients_ = boost::compute::buffer(); // deallocate
device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t), device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_only, nullptr); boost::compute::memory_object::read_only, nullptr);
...@@ -599,7 +599,7 @@ void GPUTreeLearner::BuildGPUKernels() { ...@@ -599,7 +599,7 @@ void GPUTreeLearner::BuildGPUKernels() {
} }
histogram_kernels_[i] = program.create_kernel(kernel_name_); histogram_kernels_[i] = program.create_kernel(kernel_name_);
// kernel with all features enabled, with elimited branches // kernel with all features enabled, with eliminated branches
opts << " -D ENABLE_ALL_FEATURES=1"; opts << " -D ENABLE_ALL_FEATURES=1";
try { try {
program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str()); program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str());
...@@ -781,8 +781,8 @@ void GPUTreeLearner::BeforeTrain() { ...@@ -781,8 +781,8 @@ void GPUTreeLearner::BeforeTrain() {
// use bagging // use bagging
if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) {
// On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() // On GPU, we start copying indices, gradients and Hessians now, instead of at ConstructHistogram()
// copy used gradients and hessians to ordered buffer // copy used gradients and Hessians to ordered buffer
const data_size_t* indices = data_partition_->indices(); const data_size_t* indices = data_partition_->indices();
data_size_t cnt = data_partition_->leaf_count(0); data_size_t cnt = data_partition_->leaf_count(0);
#if GPU_DEBUG > 0 #if GPU_DEBUG > 0
...@@ -829,7 +829,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -829,7 +829,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
smaller_leaf = right_leaf; smaller_leaf = right_leaf;
} }
// Copy indices, gradients and hessians as early as possible // Copy indices, gradients and Hessians as early as possible
if (smaller_leaf >= 0 && num_dense_feature_groups_) { if (smaller_leaf >= 0 && num_dense_feature_groups_) {
// only need to initialize for smaller leaf // only need to initialize for smaller leaf
// Get leaf boundary // Get leaf boundary
...@@ -839,7 +839,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -839,7 +839,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
// copy indices to the GPU: // copy indices to the GPU:
#if GPU_DEBUG >= 2 #if GPU_DEBUG >= 2
Log::Info("Copying indices, gradients and hessians to GPU..."); Log::Info("Copying indices, gradients and Hessians to GPU...");
printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin, num_data_in_left_child, num_data_in_right_child); printf("Indices size %d being copied (left = %d, right = %d)\n", end - begin, num_data_in_left_child, num_data_in_right_child);
#endif #endif
indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_); indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_);
...@@ -849,7 +849,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -849,7 +849,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
for (data_size_t i = begin; i < end; ++i) { for (data_size_t i = begin; i < end; ++i) {
ordered_hessians_[i - begin] = hessians_[indices[i]]; ordered_hessians_[i - begin] = hessians_[indices[i]];
} }
// copy ordered hessians to the GPU: // copy ordered Hessians to the GPU:
hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_); hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_);
} }
...@@ -861,7 +861,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri ...@@ -861,7 +861,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri
gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, (end - begin) * sizeof(score_t), ptr_pinned_gradients_); gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, (end - begin) * sizeof(score_t), ptr_pinned_gradients_);
#if GPU_DEBUG >= 2 #if GPU_DEBUG >= 2
Log::Info("Gradients/hessians/indices copied to device with size %d", end - begin); Log::Info("Gradients/Hessians/indices copied to device with size %d", end - begin);
#endif #endif
} }
return SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); return SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf);
...@@ -896,7 +896,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( ...@@ -896,7 +896,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync(
gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), gradients); gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), gradients);
} }
} }
// generate and copy ordered_hessians if hessians is not null // generate and copy ordered_hessians if hessians is not null
if (hessians != nullptr && !share_state_->is_constant_hessian) { if (hessians != nullptr && !share_state_->is_constant_hessian) {
if (num_data != num_data_) { if (num_data != num_data_) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
...@@ -965,7 +965,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -965,7 +965,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
} }
// construct smaller leaf // construct smaller leaf
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
// ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU // ConstructGPUHistogramsAsync will return true if there are available feature groups dispatched to GPU
bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(), nullptr, smaller_leaf_splits_->num_data_in_leaf(),
nullptr, nullptr, nullptr, nullptr,
...@@ -988,7 +988,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -988,7 +988,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
} }
} }
// Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem // Compare GPU histogram with CPU histogram, useful for debugging GPU code problems
// #define GPU_DEBUG_COMPARE // #define GPU_DEBUG_COMPARE
#ifdef GPU_DEBUG_COMPARE #ifdef GPU_DEBUG_COMPARE
for (int i = 0; i < num_dense_feature_groups_; ++i) { for (int i = 0; i < num_dense_feature_groups_; ++i) {
......
...@@ -117,7 +117,7 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -117,7 +117,7 @@ class GPUTreeLearner: public SerialTreeLearner {
/*! /*!
* \brief Compute GPU feature histogram for the current leaf. * \brief Compute GPU feature histogram for the current leaf.
* Indices, gradients and hessians have been copied to the device. * Indices, gradients and Hessians have been copied to the device.
* \param leaf_num_data Number of data on current leaf * \param leaf_num_data Number of data on current leaf
* \param use_all_features Set to true to not use feature masks, with a faster kernel * \param use_all_features Set to true to not use feature masks, with a faster kernel
*/ */
...@@ -138,11 +138,11 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -138,11 +138,11 @@ class GPUTreeLearner: public SerialTreeLearner {
* Set to nullptr to skip copy to GPU. * Set to nullptr to skip copy to GPU.
* \param num_data Number of data examples to be included in histogram * \param num_data Number of data examples to be included in histogram
* \param gradients Array of gradients for all examples. * \param gradients Array of gradients for all examples.
* \param hessians Array of hessians for all examples. * \param hessians Array of Hessians for all examples.
* \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr,
* Set gradients to nullptr to skip copy to GPU. * Set gradients to nullptr to skip copy to GPU.
* \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, * \param ordered_hessians Ordered Hessians will be generated and copied to GPU when hessians is not nullptr,
* Set hessians to nullptr to skip copy to GPU. * Set hessians to nullptr to skip copy to GPU.
* \return true if GPU kernel is launched, false if GPU is not used * \return true if GPU kernel is launched, false if GPU is not used
*/ */
bool ConstructGPUHistogramsAsync( bool ConstructGPUHistogramsAsync(
...@@ -258,7 +258,7 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -258,7 +258,7 @@ class GPUTreeLearner: public SerialTreeLearner {
boost::compute::future<void> indices_future_; boost::compute::future<void> indices_future_;
/*! \brief Asynchronous waiting object for copying gradients */ /*! \brief Asynchronous waiting object for copying gradients */
boost::compute::event gradients_future_; boost::compute::event gradients_future_;
/*! \brief Asynchronous waiting object for copying hessians */ /*! \brief Asynchronous waiting object for copying Hessians */
boost::compute::event hessians_future_; boost::compute::event hessians_future_;
}; };
......
...@@ -129,7 +129,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -129,7 +129,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary??
// total size: 2 * 256 * size_of(float) = 2 KB // total size: 2 * 256 * size_of(float) = 2 KB
// organization: each feature/grad/hessian is at a different bank, // organization: each feature/grad/hessian is at a different bank,
// as indepedent of the feature value as possible // as independent of the feature value as possible
acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array); acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
// counter histogram // counter histogram
...@@ -197,7 +197,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -197,7 +197,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because we have made the buffer large // we don't need boundary check because we have made the buffer large
int i_next = i + subglobal_size; int i_next = i + subglobal_size;
#ifdef IGNORE_INDICES #ifdef IGNORE_INDICES
// we need to check the bounds here // we need to check the bounds here
...@@ -274,10 +274,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -274,10 +274,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS; acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and hessians // write gradients and Hessians
acc_type *__restrict__ ptr_f = output; acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read hessians // even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i]; acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
} }
...@@ -441,14 +441,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -441,14 +441,14 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary?? // assume this starts at 32 * 4 = 128-byte boundary // What does it mean? boundary??
// total size: 2 * 256 * size_of(float) = 2 KB // total size: 2 * 256 * size_of(float) = 2 KB
// organization: each feature/grad/hessian is at a different bank, // organization: each feature/grad/hessian is at a different bank,
// as indepedent of the feature value as possible // as independent of the feature value as possible
acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array); acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
// counter histogram // counter histogram
// total size: 256 * size_of(unsigned int) = 1 KB // total size: 256 * size_of(unsigned int) = 1 KB
unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS); unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
// odd threads (1, 3, ...) compute histograms for hessians first // odd threads (1, 3, ...) compute histograms for Hessians first
// even threads (0, 2, ...) compute histograms for gradients first // even threads (0, 2, ...) compute histograms for gradients first
// etc. // etc.
uchar is_hessian_first = ltid & 1; uchar is_hessian_first = ltid & 1;
...@@ -462,7 +462,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -462,7 +462,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// size of threads that process this feature4 // size of threads that process this feature4
const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups);
// equavalent thread ID in this subgroup for this feature4 // equivalent thread ID in this subgroup for this feature4
const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
data_size_t ind; data_size_t ind;
...@@ -584,10 +584,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -584,10 +584,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS; acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and hessians // write gradients and Hessians
acc_type *__restrict__ ptr_f = output; acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read hessians // even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i]; acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
} }
...@@ -773,7 +773,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -773,7 +773,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// size of threads that process this feature4 // size of threads that process this feature4
const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups); const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups);
// equavalent thread ID in this subgroup for this feature4 // equivalent thread ID in this subgroup for this feature4
const unsigned int subglobal_tid = gtid - feature_id * subglobal_size; const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
data_size_t ind; data_size_t ind;
...@@ -819,7 +819,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -819,7 +819,7 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) { for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because we have made the buffer large // we don't need boundary check because we have made the buffer large
int i_next = i + subglobal_size; int i_next = i + subglobal_size;
#ifdef IGNORE_INDICES #ifdef IGNORE_INDICES
// we need to check the bounds here // we need to check the bounds here
...@@ -895,10 +895,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base, ...@@ -895,10 +895,10 @@ __global__ void KERNEL_NAME(const uchar* feature_data_base,
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS; acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and hessians // write gradients and Hessians
acc_type *__restrict__ ptr_f = output; acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) { for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read hessians // even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i]; acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment