Unverified commit f1a14869 authored by Guolin Ke, committed by GitHub

fix many cpp lint errors (#2426)

* fix many cpp lint errors

* indent

* fix bug

* fix more

* fix gpu

* more fixes
parent 4f89cc10
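
Most hunks in this commit apply cpplint's runtime/references rule: output parameters passed by non-const reference become pointers, so mutation is visible at every call site. A minimal sketch of the pattern, with hypothetical names:

```cpp
#include <vector>

// Before (flagged by cpplint runtime/references):
//   void AppendSquares(std::vector<int>& out, int n);

// After: the pointer makes mutation visible at the call site.
void AppendSquares(std::vector<int>* out, int n) {
  out->reserve(out->size() + n);
  for (int i = 0; i < n; ++i) {
    out->push_back(i * i);
  }
}

int main() {
  std::vector<int> squares;
  AppendSquares(&squares, 4);  // the explicit & signals that squares changes
  return 0;
}
```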
......@@ -59,9 +59,9 @@ int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_
}
return ret;
}
void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
for (int i = 0; i < num_indices; ++i) {
mark[indices[i]] = true;
mark->at(indices[i]) = true;
}
}
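
A side note on the rewrite above: `mark->at(i)` adds a bounds check and can throw std::out_of_range, while `(*mark)[i]` keeps the original unchecked indexing; either spelling satisfies the lint rule. A sketch of the unchecked variant (hypothetical name):

```cpp
#include <vector>

void MarkUsedSketch(std::vector<bool>* mark, const int* indices, int num_indices) {
  for (int i = 0; i < num_indices; ++i) {
    (*mark)[indices[i]] = true;  // unchecked, same cost as the old mark[...]
  }
}
```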
......@@ -115,7 +115,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
features_in_group[gid].push_back(fidx);
group_conflict_cnt[gid] += cnt;
group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
MarkUsed(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
if (is_use_gpu) {
group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
}
......@@ -127,7 +127,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
features_in_group.back().push_back(fidx);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(total_sample_cnt, false);
MarkUsed(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx]);
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
if (is_use_gpu) {
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
......@@ -137,7 +137,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
return features_in_group;
}
std::vector<std::vector<int>> FastFeatureBundling(std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
int** sample_indices,
const int* num_per_col,
size_t total_sample_cnt,
......@@ -213,17 +213,17 @@ std::vector<std::vector<int>> FastFeatureBundling(std::vector<std::unique_ptr<Bi
}
void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
int** sample_non_zero_indices,
const int* num_per_col,
size_t total_sample_cnt,
const Config& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size());
num_total_features_ = static_cast<int>(bin_mappers->size());
sparse_threshold_ = io_config.sparse_threshold;
// get num_features
std::vector<int> used_features;
for (int i = 0; i < static_cast<int>(bin_mappers.size()); ++i) {
if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trivial()) {
for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
if (bin_mappers->at(i) != nullptr && !bin_mappers->at(i)->is_trivial()) {
used_features.emplace_back(i);
}
}
......@@ -233,7 +233,7 @@ void Dataset::Construct(
auto features_in_group = NoGroup(used_features);
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(bin_mappers,
features_in_group = FastFeatureBundling(*bin_mappers,
sample_non_zero_indices, num_per_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
num_data_, io_config.min_data_in_leaf,
......@@ -261,11 +261,11 @@ void Dataset::Construct(
real_feature_idx_[cur_fidx] = real_fidx;
feature2group_[cur_fidx] = i;
feature2subfeature_[cur_fidx] = j;
cur_bin_mappers.emplace_back(bin_mappers[real_fidx].release());
cur_bin_mappers.emplace_back(bin_mappers->at(real_fidx).release());
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_,
new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_,
io_config.is_enable_sparse)));
}
feature_groups_.shrink_to_fit();
......@@ -413,7 +413,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
}
feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_,
bin_mappers,
&bin_mappers,
num_data_,
dataset->feature_groups_[i]->is_sparse_));
}
......@@ -446,7 +446,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
feature_groups_.emplace_back(new FeatureGroup(
1,
bin_mappers,
&bin_mappers,
num_data_,
sparse_threshold_,
is_enable_sparse));
......@@ -778,7 +778,7 @@ void Dataset::DumpTextFile(const char* text_filename) {
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
......@@ -830,9 +830,9 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset((void*)(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
if (ordered_bins->at(group) == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
......@@ -842,7 +842,7 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
ordered_bins->at(group)->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
......@@ -859,9 +859,9 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset((void*)(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
if (ordered_bins->at(group) == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
......@@ -870,7 +870,7 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
ordered_bins->at(group)->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
......@@ -892,9 +892,9 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset((void*)(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
if (ordered_bins->at(group) == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
......@@ -903,7 +903,7 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
ordered_bins->at(group)->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
......@@ -920,9 +920,9 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset((void*)(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
if (ordered_bins->at(group) == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
......@@ -930,7 +930,7 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
ordered_bins->at(group)->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
......@@ -967,32 +967,32 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hess
}
template<typename T>
void PushVector(std::vector<T>& dest, const std::vector<T>& src) {
dest.reserve(dest.size() + src.size());
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
dest->reserve(dest->size() + src.size());
for (auto i : src) {
dest.push_back(i);
dest->push_back(i);
}
}
template<typename T>
void PushOffset(std::vector<T>& dest, const std::vector<T>& src, const T& offset) {
dest.reserve(dest.size() + src.size());
void PushOffset(std::vector<T>* dest, const std::vector<T>& src, const T& offset) {
dest->reserve(dest->size() + src.size());
for (auto i : src) {
dest.push_back(i + offset);
dest->push_back(i + offset);
}
}
template<typename T>
void PushClearIfEmpty(std::vector<T>& dest, const size_t dest_len, const std::vector<T>& src, const size_t src_len, const T& deflt) {
if (!dest.empty() && !src.empty()) {
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::vector<T>& src, const size_t src_len, const T& deflt) {
if (!dest->empty() && !src.empty()) {
PushVector(dest, src);
} else if (!dest.empty() && src.empty()) {
} else if (!dest->empty() && src.empty()) {
for (size_t i = 0; i < src_len; ++i) {
dest.push_back(deflt);
dest->push_back(deflt);
}
} else if (dest.empty() && !src.empty()) {
} else if (dest->empty() && !src.empty()) {
for (size_t i = 0; i < dest_len; ++i) {
dest.push_back(deflt);
dest->push_back(deflt);
}
PushVector(dest, src);
}
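
A usage sketch of the pointer-taking helpers in this hunk; the template is copied here so the snippet builds on its own:

```cpp
#include <cassert>
#include <vector>

template <typename T>
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->reserve(dest->size() + src.size());
  for (auto i : src) dest->push_back(i);
}

int main() {
  std::vector<int> dest = {1, 2};
  PushVector(&dest, {3, 4});  // &dest makes the append explicit at the call site
  assert(dest.size() == 4 && dest[3] == 4);
  return 0;
}
```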
......@@ -1002,9 +1002,9 @@ void Dataset::addFeaturesFrom(Dataset* other) {
if (other->num_data_ != num_data_) {
throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
}
PushVector(feature_names_, other->feature_names_);
PushVector(feature2subfeature_, other->feature2subfeature_);
PushVector(group_feature_cnt_, other->group_feature_cnt_);
PushVector(&feature_names_, other->feature_names_);
PushVector(&feature2subfeature_, other->feature2subfeature_);
PushVector(&group_feature_cnt_, other->group_feature_cnt_);
feature_groups_.reserve(other->feature_groups_.size());
for (auto& fg : other->feature_groups_) {
feature_groups_.emplace_back(new FeatureGroup(*fg));
......@@ -1016,17 +1016,17 @@ void Dataset::addFeaturesFrom(Dataset* other) {
used_feature_map_.push_back(-1); // Unused feature.
}
}
PushOffset(real_feature_idx_, other->real_feature_idx_, num_total_features_);
PushOffset(feature2group_, other->feature2group_, num_groups_);
PushOffset(&real_feature_idx_, other->real_feature_idx_, num_total_features_);
PushOffset(&feature2group_, other->feature2group_, num_groups_);
auto bin_offset = group_bin_boundaries_.back();
// Skip the leading 0 when copying group_bin_boundaries.
for (auto i = other->group_bin_boundaries_.begin()+1; i < other->group_bin_boundaries_.end(); ++i) {
group_bin_boundaries_.push_back(*i + bin_offset);
}
PushOffset(group_feature_start_, other->group_feature_start_, num_features_);
PushOffset(&group_feature_start_, other->group_feature_start_, num_features_);
PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
PushClearIfEmpty(&monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
PushClearIfEmpty(&feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
num_features_ += other->num_features_;
num_total_features_ += other->num_total_features_;
......
......@@ -190,7 +190,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
ExtractFeaturesFromMemory(&text_data, parser.get(), dataset.get());
text_data.clear();
} else {
// sample data from file
......@@ -242,7 +242,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CreateValid(train_data);
// extract features
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
ExtractFeaturesFromMemory(&text_data, parser.get(), dataset.get());
text_data.clear();
} else {
TextReader<data_size_t> text_reader(filename, config_.header);
......@@ -692,7 +692,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
}
}
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->Construct(bin_mappers, sample_indices, num_per_col, total_sample_size, config_);
dataset->Construct(&bin_mappers, sample_indices, num_per_col, total_sample_size, config_);
dataset->set_feature_names(feature_names_);
return dataset.release();
}
......@@ -798,7 +798,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
TextReader<data_size_t> text_reader(filename, config_.header);
std::vector<std::string> out_data;
if (num_machines == 1 || config_.pre_partition) {
*num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(random_, sample_cnt, &out_data));
*num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(&random_, sample_cnt, &out_data));
} else { // need partition data
// get query data
const data_size_t* query_boundaries = metadata.query_boundaries();
......@@ -811,7 +811,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
} else {
return false;
}
}, used_data_indices, random_, sample_cnt, &out_data);
}, used_data_indices, &random_, sample_cnt, &out_data);
} else {
// if contain query file, minimal sample unit is one query
data_size_t num_queries = metadata.num_queries();
......@@ -833,7 +833,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
++qid;
}
return is_query_used;
}, used_data_indices, random_, sample_cnt, &out_data);
}, used_data_indices, &random_, sample_cnt, &out_data);
}
}
return out_data;
......@@ -1018,12 +1018,12 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
}
sample_values.clear();
dataset->Construct(bin_mappers, Common::Vector2Ptr<int>(sample_indices).data(),
dataset->Construct(&bin_mappers, Common::Vector2Ptr<int>(&sample_indices).data(),
Common::VectorSize<int>(sample_indices).data(), sample_data.size(), config_);
}
/*! \brief Extract local features from memory */
void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset) {
void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser, Dataset* dataset) {
std::vector<std::pair<int, double>> oneline_features;
double tmp_label = 0.0f;
if (predict_fun_ == nullptr) {
......@@ -1035,11 +1035,11 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
const int tid = omp_get_thread_num();
oneline_features.clear();
// parser
parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
parser->ParseOneLine(text_data->at(i).c_str(), &oneline_features, &tmp_label);
// set label
dataset->metadata_.SetLabelAt(i, static_cast<label_t>(tmp_label));
// free processed line:
text_data[i].clear();
text_data->at(i).clear();
// shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
// text_reader_->Lines()[i].shrink_to_fit();
// push data
......@@ -1072,7 +1072,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
const int tid = omp_get_thread_num();
oneline_features.clear();
// parser
parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
parser->ParseOneLine(text_data->at(i).c_str(), &oneline_features, &tmp_label);
// set initial score
std::vector<double> oneline_init_score(num_class_);
predict_fun_(oneline_features, oneline_init_score.data());
......@@ -1110,7 +1110,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
dataset->FinishLoad();
// text data can be free after loaded feature values
text_data.clear();
text_data->clear();
}
/*! \brief Extract local features from file */
......
......@@ -48,7 +48,7 @@ template <typename VAL_T>
class DenseBin: public Bin {
public:
friend DenseBinIterator<VAL_T>;
DenseBin(data_size_t num_data)
explicit DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
}
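
Why cpplint flags the single-argument constructors fixed in this and the following hunks: without `explicit`, a bare integer silently converts to the bin type. A toy stand-in (not the real DenseBin):

```cpp
#include <vector>

class ToyBin {  // stands in for DenseBin (assumption)
 public:
  explicit ToyBin(int num_data) : data_(num_data, 0) {}
 private:
  std::vector<int> data_;
};

void Consume(const ToyBin&) {}

int main() {
  Consume(ToyBin(100));  // deliberate construction still works
  // Consume(100);       // compiled before `explicit`; now rejected
  return 0;
}
```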
......@@ -192,7 +192,7 @@ class DenseBin: public Bin {
}
}
virtual data_size_t Split(
data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......@@ -253,7 +253,7 @@ class DenseBin: public Bin {
return lte_count;
}
virtual data_size_t SplitCategorical(
data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......
......@@ -42,7 +42,7 @@ class Dense4bitsBinIterator : public BinIterator {
class Dense4bitsBin : public Bin {
public:
friend Dense4bitsBinIterator;
Dense4bitsBin(data_size_t num_data)
explicit Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) {
int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
......@@ -215,7 +215,7 @@ class Dense4bitsBin : public Bin {
}
}
virtual data_size_t Split(
data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......@@ -276,7 +276,7 @@ class Dense4bitsBin : public Bin {
return lte_count;
}
virtual data_size_t SplitCategorical(
data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......
......@@ -54,7 +54,7 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter {
const std::string mode_;
};
const std::string kHdfsProto = "hdfs://";
const char* kHdfsProto = "hdfs://";
#ifdef USE_HDFS
struct HDFSFile : VirtualFileReader, VirtualFileWriter {
......
......@@ -36,7 +36,7 @@ class OrderedSparseBin: public OrderedBin {
SparsePair() : ridx(0), bin(0) {}
};
OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
explicit OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
......
......@@ -27,13 +27,13 @@ void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt)
}
}
int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) {
int GetLabelIdxForLibsvm(const std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto pos_space = str.find_first_of(" \f\n\r\t\v");
auto pos_colon = str.find_first_of(":");
auto str2 = Common::Trim(str);
auto pos_space = str2.find_first_of(" \f\n\r\t\v");
auto pos_colon = str2.find_first_of(":");
if (pos_space == std::string::npos || pos_space < pos_colon) {
return label_idx;
} else {
......@@ -41,12 +41,12 @@ int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) {
}
}
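
The three GetLabelIdxFor* fixes share one pattern: take the string by const reference and trim into a local copy instead of mutating the caller's argument. A self-contained sketch, with a toy Trim standing in for Common::Trim and a hypothetical helper name:

```cpp
#include <string>

// Toy stand-in for Common::Trim (assumption).
std::string Trim(const std::string& s) {
  auto b = s.find_first_not_of(" \f\n\r\t\v");
  if (b == std::string::npos) return std::string();
  auto e = s.find_last_not_of(" \f\n\r\t\v");
  return s.substr(b, e - b + 1);
}

bool LooksLikeLibsvm(const std::string& str) {  // hypothetical helper
  auto trimmed = Trim(str);  // trim into a local; `str` is never mutated
  auto pos_space = trimmed.find_first_of(" \f\n\r\t\v");
  auto pos_colon = trimmed.find_first_of(':');
  return pos_colon != std::string::npos &&
         (pos_space == std::string::npos || pos_colon < pos_space);
}
```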
int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) {
int GetLabelIdxForTSV(const std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), '\t');
auto str2 = Common::Trim(str);
auto tokens = Common::Split(str2.c_str(), '\t');
if (static_cast<int>(tokens.size()) == num_features) {
return -1;
} else {
......@@ -54,12 +54,12 @@ int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) {
}
}
int GetLabelIdxForCSV(std::string& str, int num_features, int label_idx) {
int GetLabelIdxForCSV(const std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), ',');
auto str2 = Common::Trim(str);
auto tokens = Common::Split(str2.c_str(), ',');
if (static_cast<int>(tokens.size()) == num_features) {
return -1;
} else {
......@@ -74,18 +74,18 @@ enum DataType {
LIBSVM
};
void getline(std::stringstream& ss, std::string& line, const VirtualFileReader* reader, std::vector<char>& buffer, size_t buffer_size) {
std::getline(ss, line);
while (ss.eof()) {
size_t read_len = reader->Read(buffer.data(), buffer_size);
void GetLine(std::stringstream* ss, std::string* line, const VirtualFileReader* reader, std::vector<char>* buffer, size_t buffer_size) {
std::getline(*ss, *line);
while (ss->eof()) {
size_t read_len = reader->Read(buffer->data(), buffer_size);
if (read_len <= 0) {
break;
}
ss.clear();
ss.str(std::string(buffer.data(), read_len));
ss->clear();
ss->str(std::string(buffer->data(), read_len));
std::string tmp;
std::getline(ss, tmp);
line += tmp;
std::getline(*ss, tmp);
*line += tmp;
}
}
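
What the renamed GetLine does, as a self-contained sketch with a toy reader standing in for VirtualFileReader: when std::getline exhausts the buffered chunk mid-line, it pulls the next chunk and stitches the remainder onto the same line. (The rename also stops shadowing std::getline.)

```cpp
#include <cstring>
#include <sstream>
#include <string>
#include <vector>

struct ToyReader {  // stands in for VirtualFileReader (assumption)
  const char* src;
  size_t pos = 0;
  size_t Read(char* buf, size_t n) {
    size_t left = std::strlen(src) - pos;
    size_t take = left < n ? left : n;
    std::memcpy(buf, src + pos, take);
    pos += take;
    return take;
  }
};

void GetLineSketch(std::stringstream* ss, std::string* line, ToyReader* reader,
                   std::vector<char>* buffer, size_t buffer_size) {
  std::getline(*ss, *line);
  while (ss->eof()) {  // the line may have been cut at a chunk boundary
    size_t read_len = reader->Read(buffer->data(), buffer_size);
    if (read_len <= 0) break;
    ss->clear();
    ss->str(std::string(buffer->data(), read_len));
    std::string tmp;
    std::getline(*ss, tmp);
    *line += tmp;  // stitch the rest of the cut line back on
  }
}
```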
......@@ -105,16 +105,16 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features
std::stringstream tmp_file(std::string(buffer.data(), read_len));
if (header) {
if (!tmp_file.eof()) {
getline(tmp_file, line1, reader.get(), buffer, buffer_size);
GetLine(&tmp_file, &line1, reader.get(), &buffer, buffer_size);
}
}
if (!tmp_file.eof()) {
getline(tmp_file, line1, reader.get(), buffer, buffer_size);
GetLine(&tmp_file, &line1, reader.get(), &buffer, buffer_size);
} else {
Log::Fatal("Data file %s should have at least one line", filename);
}
if (!tmp_file.eof()) {
getline(tmp_file, line2, reader.get(), buffer, buffer_size);
GetLine(&tmp_file, &line2, reader.get(), &buffer, buffer_size);
} else {
Log::Warning("Data file %s only has one line", filename);
}
......
......@@ -75,7 +75,7 @@ class SparseBin: public Bin {
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
SparseBin(data_size_t num_data)
explicit SparseBin(data_size_t num_data)
: num_data_(num_data) {
int num_threads = 1;
#pragma omp parallel
......@@ -145,7 +145,7 @@ class SparseBin: public Bin {
}
}
virtual data_size_t Split(
data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......@@ -208,7 +208,7 @@ class SparseBin: public Bin {
return lte_count;
}
virtual data_size_t SplitCategorical(
data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
......
......@@ -72,7 +72,7 @@ class MulticlassMetric: public Metric {
std::vector<double> rec(num_pred_per_row);
objective->ConvertOutput(raw_score.data(), rec.data());
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec, config_);
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
......@@ -85,7 +85,7 @@ class MulticlassMetric: public Metric {
std::vector<double> rec(num_pred_per_row);
objective->ConvertOutput(raw_score.data(), rec.data());
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec, config_) * weights_[i];
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_) * weights_[i];
}
}
} else {
......@@ -98,7 +98,7 @@ class MulticlassMetric: public Metric {
rec[k] = static_cast<double>(score[idx]);
}
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec, config_);
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
......@@ -109,7 +109,7 @@ class MulticlassMetric: public Metric {
rec[k] = static_cast<double>(score[idx]);
}
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec, config_) * weights_[i];
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_) * weights_[i];
}
}
}
......@@ -138,11 +138,11 @@ class MultiErrorMetric: public MulticlassMetric<MultiErrorMetric> {
public:
explicit MultiErrorMetric(const Config& config) :MulticlassMetric<MultiErrorMetric>(config) {}
inline static double LossOnPoint(label_t label, std::vector<double>& score, const Config& config) {
inline static double LossOnPoint(label_t label, std::vector<double>* score, const Config& config) {
size_t k = static_cast<size_t>(label);
int num_larger = 0;
for (size_t i = 0; i < score.size(); ++i) {
if (score[i] >= score[k]) ++num_larger;
for (size_t i = 0; i < score->size(); ++i) {
if (score->at(i) >= score->at(k)) ++num_larger;
if (num_larger > config.multi_error_top_k) return 1.0f;
}
return 0.0f;
......@@ -162,10 +162,10 @@ class MultiSoftmaxLoglossMetric: public MulticlassMetric<MultiSoftmaxLoglossMetr
public:
explicit MultiSoftmaxLoglossMetric(const Config& config) :MulticlassMetric<MultiSoftmaxLoglossMetric>(config) {}
inline static double LossOnPoint(label_t label, std::vector<double>& score, const Config&) {
inline static double LossOnPoint(label_t label, std::vector<double>* score, const Config&) {
size_t k = static_cast<size_t>(label);
if (score[k] > kEpsilon) {
return static_cast<double>(-std::log(score[k]));
if (score->at(k) > kEpsilon) {
return static_cast<double>(-std::log(score->at(k)));
} else {
return -std::log(kEpsilon);
}
......
......@@ -8,6 +8,7 @@
#include <LightGBM/config.h>
#include <LightGBM/meta.h>
#include <LightGBM/network.h>
#include <LightGBM/utils/common.h>
#include <string>
#include <algorithm>
......@@ -19,7 +20,6 @@
#ifdef USE_SOCKET
#include "socket_wrapper.hpp"
#include <LightGBM/utils/common.h>
#endif
#ifdef USE_MPI
......
......@@ -6,10 +6,14 @@
#define LIGHTGBM_NETWORK_SOCKET_WRAPPER_HPP_
#ifdef USE_SOCKET
#include <LightGBM/utils/log.h>
#if defined(_WIN32)
#ifdef _MSC_VER
#define NOMINMAX
#endif
#include <winsock2.h>
#include <ws2tcpip.h>
#include <iphlpapi.h>
......@@ -18,7 +22,6 @@
#include <fcntl.h>
#include <netdb.h>
#include <cerrno>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
......@@ -30,10 +33,9 @@
#endif
#include <LightGBM/utils/log.h>
#include <string>
#include <cerrno>
#include <cstdlib>
#include <string>
#include <unordered_set>
#ifdef _MSC_VER
......
......@@ -105,7 +105,7 @@ class LambdarankNDCG: public ObjectiveFunction {
}
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) { return score[a] > score[b]; });
// get best and worst score
// get best and worst score
const double best_score = score[sorted_idx[0]];
data_size_t worst_idx = cnt - 1;
if (worst_idx > 0 && score[sorted_idx[worst_idx]] == kMinScore) {
......@@ -143,7 +143,7 @@ class LambdarankNDCG: public ObjectiveFunction {
const double paired_discount = fabs(high_discount - low_discount);
// get delta NDCG
double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg;
// regular the delta_pair_NDCG by score distance
// regular the delta_pair_NDCG by score distance
if (norm_ && high_label != low_label && best_score != worst_score) {
delta_pair_NDCG /= (0.01f + fabs(delta_score));
}
......
......@@ -138,7 +138,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
}
});
// copy back
std::memcpy((void*)&data, output_buffer_.data(), size);
std::memcpy(reinterpret_cast<void*>(&data), output_buffer_.data(), size);
// set global sumup info
this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data));
// init global data count in leaf
......
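
The memcpy/memset hunks swap C-style `(void*)` casts for reinterpret_cast, per cpplint's readability/casting rule: a named cast is greppable and, unlike a C-style cast, cannot silently cast away const. A minimal sketch with a toy struct standing in for HistogramBinEntry:

```cpp
#include <cstring>

struct Entry { double sum_gradients; double sum_hessians; int cnt; };  // toy HistogramBinEntry (assumption)

int main() {
  Entry hist[8] = {};
  // Before: std::memset((void*)(hist + 1), 0, 7 * sizeof(Entry));
  std::memset(reinterpret_cast<void*>(hist + 1), 0, 7 * sizeof(Entry));
  return 0;
}
```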
......@@ -184,8 +184,8 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
// copy the results asynchronously. Size depends on if double precision is used
size_t output_size = num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
boost::compute::event histogram_wait_event;
host_histogram_outputs_ = (void*)queue_.enqueue_map_buffer_async(device_histogram_outputs_, boost::compute::command_queue::map_read,
0, output_size, histogram_wait_event, kernel_wait_obj_);
host_histogram_outputs_ = reinterpret_cast<void*>(queue_.enqueue_map_buffer_async(device_histogram_outputs_, boost::compute::command_queue::map_read,
0, output_size, histogram_wait_event, kernel_wait_obj_));
// we will wait for this object in WaitAndGetHistograms
histograms_wait_obj_ = boost::compute::wait_list(histogram_wait_event);
}
......@@ -736,7 +736,7 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
}
Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, Json& forced_split_json) {
bool is_constant_hessian, const Json& forced_split_json) {
// check if we need to recompile the GPU kernel (is_constant_hessian changed)
// this should rarely occur
if (is_constant_hessian != is_constant_hessian_) {
......@@ -977,7 +977,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
......@@ -1030,7 +1030,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
......
......@@ -48,7 +48,7 @@ class GPUTreeLearner: public SerialTreeLearner {
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingData(const Dataset* train_data) override;
Tree* Train(const score_t* gradients, const score_t *hessians,
bool is_constant_hessian, Json& forced_split_json) override;
bool is_constant_hessian, const Json& forced_split_json) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(used_indices, num_data);
......
......@@ -19,7 +19,7 @@ namespace LightGBM {
*/
class LeafSplits {
public:
LeafSplits(data_size_t num_data)
explicit LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) {
}
......
......@@ -170,7 +170,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
histogram_pool_.ResetConfig(config_);
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) {
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
gradients_ = gradients;
hessians_ = hessians;
is_constant_hessian_ = is_constant_hessian;
......@@ -290,7 +290,7 @@ std::vector<int8_t> SerialTreeLearner::GetUsedFeatures(bool is_tree_level) {
CHECK(inner_feature_index >= 0);
ret[inner_feature_index] = 1;
}
} else if(used_feature_indices_.size() <= 0) {
} else if (used_feature_indices_.size() <= 0) {
int used_feature_cnt = static_cast<int>(std::round(valid_feature_indices_.size() * config_->feature_fraction_bynode));
used_feature_cnt = std::max(used_feature_cnt, min_used_features);
auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
......@@ -502,7 +502,7 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
ptr_smaller_leaf_hist_data);
......@@ -512,7 +512,7 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
ptr_larger_leaf_hist_data);
}
......@@ -636,7 +636,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
#endif
}
int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf,
int* right_leaf, int *cur_depth,
bool *aborted_last_force_split) {
int32_t result_count = 0;
......@@ -819,7 +819,7 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
Common::InsertBitset(feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
Common::InsertBitset(&feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
}
}
......@@ -932,8 +932,8 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj
for (int i = 0; i < tree->num_leaves(); ++i) {
outputs[i] = static_cast<double>(tree->LeafOutput(i));
}
Network::GlobalSum(outputs);
Network::GlobalSum(n_nozeroworker_perleaf);
outputs = Network::GlobalSum(&outputs);
n_nozeroworker_perleaf = Network::GlobalSum(&n_nozeroworker_perleaf);
for (int i = 0; i < tree->num_leaves(); ++i) {
tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]);
}
......
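
The GlobalSum hunk above implies the signature changed from mutating a vector taken by reference to taking a pointer and returning the reduced vector. A single-process toy sketch of that call shape (assumption; the real implementation reduces across distributed workers):

```cpp
#include <vector>

std::vector<double> GlobalSum(std::vector<double>* local) {
  return *local;  // with one worker, the global sum is the local vector
}

int main() {
  std::vector<double> outputs = {1.0, 2.0};
  outputs = GlobalSum(&outputs);  // call shape matches the hunk above
  return 0;
}
```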
......@@ -49,7 +49,7 @@ class SerialTreeLearner: public TreeLearner {
void ResetConfig(const Config* config) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian,
Json& forced_split_json) override;
const Json& forced_split_json) override;
Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
......@@ -78,7 +78,6 @@ class SerialTreeLearner: public TreeLearner {
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
protected:
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
* \brief Some initial works before training
......@@ -106,7 +105,7 @@ class SerialTreeLearner: public TreeLearner {
virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);
/* Force splits with forced_split_json dict and then return num splits forced.*/
virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
virtual int32_t ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf,
int* right_leaf, int* cur_depth,
bool *aborted_last_force_split);
......
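
forced_split_json is only read during training, so it becomes `const Json&`; besides documenting intent, a const reference also binds temporaries. A toy sketch (ToyJson is a stand-in, not the real Json type):

```cpp
#include <cstdint>
#include <string>

struct ToyJson {
  std::string dump() const { return "{}"; }
};

int32_t ForceSplitsSketch(const ToyJson& forced_split_json) {
  forced_split_json.dump();  // read-only use compiles; mutation would not
  return 0;
}

int main() {
  ForceSplitsSketch(ToyJson{});  // const& also binds temporaries
  return 0;
}
```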
......@@ -135,7 +135,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
}
});
std::memcpy((void*)&data, output_buffer_.data(), size);
std::memcpy(reinterpret_cast<void*>(&data), output_buffer_.data(), size);
// set global sumup info
smaller_leaf_splits_global_->Init(std::get<1>(data), std::get<2>(data));
......