Commit 1c774687 authored by Guolin Ke's avatar Guolin Ke
Browse files

first commit

parents
#ifndef LIGHTGBM_TREE_LEARNER_H_
#define LIGHTGBM_TREE_LEARNER_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
namespace LightGBM {
/*! \brief forward declaration */
class Tree;
class Dataset;
/*!
* \brief Interface for tree learner
*/
class TreeLearner {
 public:
  /*! \brief virtual destructor */
  virtual ~TreeLearner() {}
  /*!
  * \brief Init tree learner with the training data set
  * \param train_data The training data
  */
  virtual void Init(const Dataset* train_data) = 0;
  /*!
  * \brief Fit the training data and return a trained tree
  * \param gradients The first order gradients
  * \param hessians The second order gradients
  * \return A trained tree
  */
  virtual Tree* Train(const score_t* gradients, const score_t* hessians) = 0;
  /*!
  * \brief Set the data subset used for bagging
  * \param used_indices Used data indices
  * \param num_data Number of used data
  */
  virtual void SetBaggingData(const data_size_t* used_indices,
    data_size_t num_data) = 0;
  /*!
  * \brief Use the last trained tree to predict the training score, and add it to out_score
  * \param out_score Output score buffer
  */
  virtual void AddPredictionToScore(score_t *out_score) const = 0;
  /*!
  * \brief Factory: create an object of a concrete tree learner
  * \param type Type of tree learner
  * \param tree_config The tree settings
  */
  static TreeLearner* CreateTreeLearner(TreeLearnerType type,
    const TreeConfig& tree_config);
};
} // namespace LightGBM
#endif  // LightGBM_TREE_LEARNER_H_
#ifndef LIGHTGBM_UTILS_ARRAY_AGRS_H_
#define LIGHTGBM_UTILS_ARRAY_AGRS_H_
#include <vector>
#include <algorithm>
namespace LightGBM {
/*!
* \brief Contains some operation for a array, e.g. ArgMax, TopK.
*/
template<typename VAL_T>
class ArrayArgs {
 public:
  /*! \brief Index of the maximum element; returns 0 for an empty array. */
  inline static size_t ArgMax(const std::vector<VAL_T>& array) {
    if (array.empty()) {
      return 0;
    }
    size_t arg_max = 0;
    for (size_t i = 1; i < array.size(); ++i) {
      if (array[i] > array[arg_max]) {
        arg_max = i;
      }
    }
    return arg_max;
  }
  /*! \brief Index of the minimum element; returns 0 for an empty array. */
  inline static size_t ArgMin(const std::vector<VAL_T>& array) {
    if (array.empty()) {
      return 0;
    }
    size_t arg_min = 0;
    for (size_t i = 1; i < array.size(); ++i) {
      if (array[i] < array[arg_min]) {
        arg_min = i;
      }
    }
    return arg_min;
  }
  /*! \brief Index of the maximum element in array[0..n); returns 0 when n == 0. */
  inline static size_t ArgMax(const VAL_T* array, size_t n) {
    if (n == 0) {
      return 0;
    }
    size_t arg_max = 0;
    for (size_t i = 1; i < n; ++i) {
      if (array[i] > array[arg_max]) {
        arg_max = i;
      }
    }
    return arg_max;
  }
  /*! \brief Index of the minimum element in array[0..n); returns 0 when n == 0. */
  inline static size_t ArgMin(const VAL_T* array, size_t n) {
    if (n == 0) {
      return 0;
    }
    size_t arg_min = 0;
    for (size_t i = 1; i < n; ++i) {
      if (array[i] < array[arg_min]) {
        arg_min = i;
      }
    }
    return arg_min;
  }
  /*!
  * \brief Partition [start, end) around the last element so that values
  *        greater than the pivot come first; returns the pivot's final index.
  */
  inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) {
    VAL_T& pivot = (*array)[end - 1];
    size_t p_idx = start;
    for (size_t i = start; i < end - 1; ++i) {
      if ((*array)[i] > pivot) {
        std::swap((*array)[p_idx], (*array)[i]);
        ++p_idx;
      }
    }
    std::swap((*array)[p_idx], (*array)[end - 1]);
    return p_idx;
  }
  /*!
  * \brief Quickselect on the descending order: afterwards positions
  *        [start, k] hold the k - start + 1 largest values of the range.
  * \return k (the index where the (k+1)-th largest value now sits)
  */
  inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) {
    if (start == end - 1) {
      return start;
    }
    size_t p_idx = Partition(array, start, end);
    if (p_idx == k) {
      return p_idx;
    } else if (k < p_idx) {
      return ArgMaxAtK(array, start, p_idx, k);
    } else {
      return ArgMaxAtK(array, p_idx + 1, end, k);
    }
  }
  /*!
  * \brief Copy the k largest values of array into out (order unspecified).
  *        Returns the whole array when k >= array.size(), nothing when k == 0.
  */
  inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) {
    out->clear();
    if (k == 0) {
      return;
    }
    // copy in one shot instead of element-wise push_back
    out->assign(array.begin(), array.end());
    if (k >= array.size()) {
      return;
    }
    // move the k largest values to the front, then drop the rest
    ArgMaxAtK(out, 0, out->size(), k - 1);
    out->erase(out->begin() + k, out->end());
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_ARRAY_AGRS_H_
#ifndef LIGHTGBM_UTILS_COMMON_FUN_H_
#define LIGHTGBM_UTILS_COMMON_FUN_H_
#include <LightGBM/utils/log.h>
#include <cstdio>
#include <string>
#include <vector>
#include <sstream>
#include <cstdint>
namespace LightGBM {
namespace Common {
/*! \brief Return the larger of a and b (b when they compare equal). */
template<typename T>
inline static T Max(const T& a, const T& b) {
  if (a > b) {
    return a;
  }
  return b;
}
/*! \brief Return the smaller of a and b (b when they compare equal). */
template<typename T>
inline static T Min(const T& a, const T& b) {
  if (a < b) {
    return a;
  }
  return b;
}
/*! \brief Strip leading and trailing whitespace from str in place; returns str. */
inline static std::string& Trim(std::string& str) {
  const char* kWhitespace = " \f\n\r\t\v";
  if (!str.empty()) {
    // drop the trailing run first, then the leading one
    str.erase(str.find_last_not_of(kWhitespace) + 1);
    str.erase(0, str.find_first_not_of(kWhitespace));
  }
  return str;
}
/*! \brief Split str on delimiter; empty fields between delimiters are kept. */
inline static std::vector<std::string> Split(const char* str, char delimiter) {
  std::vector<std::string> pieces;
  std::stringstream stream(str);
  for (std::string piece; std::getline(stream, piece, delimiter);) {
    pieces.push_back(piece);
  }
  return pieces;
}
/*!
* \brief Parse a decimal integer (optional sign, surrounding spaces allowed).
* \param p Input text
* \param out Parsed value
* \return Pointer just past the number and any trailing spaces
*/
inline static const char* Atoi(const char* p, int* out) {
  while (*p == ' ') { ++p; }
  int sign = 1;
  if (*p == '-') {
    sign = -1;
    ++p;
  } else if (*p == '+') {
    ++p;
  }
  int result = 0;
  while (*p >= '0' && *p <= '9') {
    result = result * 10 + (*p - '0');
    ++p;
  }
  *out = sign * result;
  while (*p == ' ') { ++p; }
  return p;
}
//ref to http://www.leapsecond.com/tools/fast_atof.c
/*!
* \brief Fast string-to-double parse (based on leapsecond.com fast_atof).
* \param p Input text; optional sign, fraction and e/E exponent,
*        surrounding spaces allowed
* \param out Parsed value
* \return Pointer just past the parsed number and any trailing spaces
*/
inline static const char* Atof(const char* p, double* out) {
  int frac;
  double sign, value, scale;
  // Skip leading white space, if any.
  while (*p == ' ') {
    ++p;
  }
  // Get sign, if any.
  sign = 1.0;
  if (*p == '-') {
    sign = -1.0;
    ++p;
  }
  else if (*p == '+') {
    ++p;
  }
  // Get digits before decimal point or exponent, if any.
  for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
    value = value * 10.0 + (*p - '0');
  }
  // Get digits after decimal point, if any.
  if (*p == '.') {
    double pow10 = 10.0;
    ++p;
    while (*p >= '0' && *p <= '9') {
      value += (*p - '0') / pow10;
      pow10 *= 10.0;
      ++p;
    }
  }
  // Handle exponent, if any.
  frac = 0;  // set when the exponent is negative (divide instead of multiply)
  scale = 1.0;
  if ((*p == 'e') || (*p == 'E')) {
    unsigned int expon;
    // Get sign of exponent, if any.
    ++p;
    if (*p == '-') {
      frac = 1;
      ++p;
    }
    else if (*p == '+') {
      ++p;
    }
    // Get digits of exponent, if any.
    for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
      expon = expon * 10 + (*p - '0');
    }
    // Clamp to the largest double decimal exponent to avoid overflowing scale.
    if (expon > 308) expon = 308;
    // Calculate scaling factor in big steps to keep the loop count small.
    while (expon >= 50) { scale *= 1E50; expon -= 50; }
    while (expon >= 8) { scale *= 1E8; expon -= 8; }
    while (expon > 0) { scale *= 10.0; expon -= 1; }
  }
  // Return signed and scaled floating point result.
  *out = sign * (frac ? (value / scale) : (value * scale));
  // Skip trailing spaces so the caller resumes at the next token.
  while (*p == ' ') {
    ++p;
  }
  return p;
}
/*! \brief Advance p past any run of spaces and tabs. */
inline static const char* SkipSpaceAndTab(const char* p) {
  for (; *p == ' ' || *p == '\t'; ++p) {}
  return p;
}
/*! \brief Advance p past any run of newline, carriage-return and space characters. */
inline static const char* SkipReturn(const char* p) {
  for (; *p == '\n' || *p == '\r' || *p == ' '; ++p) {}
  return p;
}
/*! \brief Join the first n elements of arr into a delimiter-separated string. */
template<typename T>
inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
  std::stringstream buf;
  for (int i = 0; i < n; ++i) {
    if (i > 0) {
      buf << delimiter;
    }
    buf << arr[i];
  }
  return buf.str();
}
/*!
* \brief Parse a delimited string into a pre-allocated int array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToIntArray error, size don't equal.");
  }
  size_t idx = 0;
  for (auto& part : parts) {
    part = Trim(part);
    Atoi(part.c_str(), &out[idx]);
    ++idx;
  }
}
/*!
* \brief Parse a delimited string into a pre-allocated double array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToDoubleArray error, size don't equal");
  }
  size_t idx = 0;
  for (auto& part : parts) {
    part = Trim(part);
    Atof(part.c_str(), &out[idx]);
    ++idx;
  }
}
/*!
* \brief Parse a delimited string into a pre-allocated float array of size n.
*        Aborts via Log::Stderr when the field count differs from n.
*/
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, float* out) {
  std::vector<std::string> parts = Split(str.c_str(), delimiter);
  if (parts.size() != n) {
    Log::Stderr("StringToDoubleArray error, size don't equal");
  }
  for (size_t i = 0; i < parts.size(); ++i) {
    parts[i] = Trim(parts[i]);
    double parsed;
    Atof(parts[i].c_str(), &parsed);
    // values are parsed as double, then narrowed to float
    out[i] = static_cast<float>(parsed);
  }
}
/*!
* \brief Parse a delimited string into a vector of doubles.
* \param str Input text, fields separated by delimiter
* \return One parsed value per field
*/
inline static std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) {
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  std::vector<double> ret;
  ret.reserve(strs.size());  // field count is known: allocate once
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
    double val = 0.0;
    Atof(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}
/*!
* \brief Parse a delimited string into a vector of ints.
* \param str Input text, fields separated by delimiter
* \return One parsed value per field
*/
inline static std::vector<int> StringToIntArray(const std::string& str, char delimiter) {
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  std::vector<int> ret;
  ret.reserve(strs.size());  // field count is known: allocate once
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
    int val = 0;
    Atoi(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}
/*! \brief Join all strings with delimiter; empty input yields an empty string. */
inline static std::string Join(const std::vector<std::string>& strs, char delimiter) {
  std::stringstream result;
  bool first = true;
  for (const auto& s : strs) {
    if (!first) {
      result << delimiter;
    }
    result << s;
    first = false;
  }
  return result.str();
}
/*!
* \brief Join strs[start..end) with delimiter; end is clamped to strs.size().
* \return Empty string for an empty/invalid range.
*
* Fix: the old guard `end - start <= 0` underflowed on size_t when
* end < start, and an empty vector (or start >= size) reached
* strs[start] out of bounds. The range is now validated up front.
*/
inline static std::string Join(const std::vector<std::string>& strs, size_t start, size_t end, char delimiter) {
  if (end > strs.size()) {
    end = strs.size();
  }
  if (start >= end) {
    return std::string("");
  }
  std::stringstream ss;
  ss << strs[start];
  for (size_t i = start + 1; i < end; ++i) {
    ss << delimiter;
    ss << strs[i];
  }
  return ss.str();
}
/*!
* \brief Smallest power of two >= x (returns 1 for x <= 1,
*        0 when x exceeds 2^62, the largest power representable safely).
*
* Fix: the old loop shifted t up to 2^63, which overflows signed
* int64_t (undefined behavior). The shift now stops at 2^62.
*/
static inline int64_t Pow2RoundUp(int64_t x) {
  int64_t t = 1;
  for (int i = 0; i < 62; ++i) {
    if (t >= x) {
      return t;
    }
    t <<= 1;
  }
  // t == 2^62 here; one last check without shifting further
  if (t >= x) {
    return t;
  }
  return 0;
}
} // namespace Common
} // namespace LightGBM
#endif  // LightGBM_UTILS_COMMON_FUN_H_
#ifndef LIGHTGBM_UTILS_LOG_H_
#define LIGHTGBM_UTILS_LOG_H_
#include <cstdio>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
namespace LightGBM {
/*!
* \brief Minimal logging helper: wraps the caller's printf-style format
*        in a LightGBM prefix and prints via vfprintf.
*/
class Log {
 public:
  /*!
  * \brief Print an error message to stderr and terminate the process.
  * \param format printf-style format for the message body
  */
  inline static void Stderr(const char *format, ...) {
    va_list argptr;
    char fixed[512];
    // Fix: snprintf truncates instead of overflowing the fixed buffer
    // (sprintf/sprintf_s could overflow / abort on a long format string);
    // it is standard since C++11, so the MSVC fork is no longer needed.
    snprintf(fixed, sizeof(fixed), "[LightGBM Error] %s \n", format);
    va_start(argptr, format);
    vfprintf(stderr, fixed, argptr);
    va_end(argptr);
    fflush(stderr);
    std::exit(1);
  }
  /*!
  * \brief Print an informational message to stdout.
  * \param format printf-style format for the message body
  */
  inline static void Stdout(const char *format, ...) {
    va_list argptr;
    char fixed[512];
    snprintf(fixed, sizeof(fixed), "[LightGBM] %s\n", format);
    va_start(argptr, format);
    vfprintf(stdout, fixed, argptr);
    va_end(argptr);
    fflush(stdout);
  }
};
/*!
* \brief Abort (via Log::Stderr) with file/line info when condition is false.
* NOTE(review): expands to a bare `if` with a trailing `;`, so callers may
* rely on `CHECK(x)` without a semicolon; beware the dangling-else hazard
* when using it inside an `if`/`else`.
*/
#define CHECK(condition) \
if (!(condition)) Log::Stderr("Check failed: " #condition \
" at %s, line %d .\n", __FILE__, __LINE__);
} // namespace LightGBM
#endif  // LightGBM_UTILS_LOG_H_
#ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_
#define LIGHTGBM_UTILS_PIPELINE_READER_H_
#include <LightGBM/utils/log.h>
#include <cstdio>
#include <algorithm>
#include <functional>
#include <thread>
namespace LightGBM{
/*!
* \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block
*/
class PipelineReader {
 public:
  /*!
  * \brief Read a file using a two-thread pipeline: while this thread runs
  *        process_fun on the current block, a worker thread reads the next
  *        block; the two buffers are then swapped.
  * \param filename Filename of data
  * \param process_fun Callback per block; receives (buffer, byte count) and
  *        returns a count that is summed into the overall return value
  * \return Accumulated return values of process_fun, or 0 if the file
  *         cannot be opened
  */
  static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) {
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, filename, "rb");
#else
    file = fopen(filename, "rb");
#endif
    if (file == NULL) {
      return 0;
    }
    size_t cnt = 0;
    // 16MB per buffer; two buffers alternate between reading and processing
    const size_t buffer_size = 16 * 1024 * 1024;
    // buffer used for the process_fun
    char* buffer_process = new char[buffer_size];
    // buffer used for the file reading
    char* buffer_read = new char[buffer_size];
    // read the first block synchronously
    size_t read_cnt = fread(buffer_process, 1, buffer_size, file);
    size_t last_read_cnt = 0;
    while (read_cnt > 0) {
      // start the read thread: prefetch the next block into buffer_read
      std::thread read_worker = std::thread(
        [file, buffer_read, buffer_size, &last_read_cnt] {
        last_read_cnt = fread(buffer_read, 1, buffer_size, file);
      }
      );
      // process the current block on this thread, in parallel with the read
      cnt += process_fun(buffer_process, read_cnt);
      // wait for the read thread
      read_worker.join();
      // exchange the buffers for the next round
      std::swap(buffer_process, buffer_read);
      read_cnt = last_read_cnt;
    }
    delete[] buffer_process;
    delete[] buffer_read;
    // close file
    fclose(file);
    return cnt;
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_PIPELINE_READER_H_
#ifndef LIGHTGBM_UTILS_RANDOM_H_
#define LIGHTGBM_UTILS_RANDOM_H_
#include <cstdint>
#include <random>
#include <vector>
namespace LightGBM {
/*!
* \brief A wrapper for random generator
*/
/*!
* \brief A wrapper for random generator
*/
class Random {
 public:
  /*! \brief Constructor, seeded from the system random device */
  Random()
    : distribution_zero_to_one_(0.0, 1.0) {
    std::random_device rd;
    generator_ = std::mt19937(rd());
  }
  /*! \brief Constructor with a specific seed (deterministic sequence) */
  Random(int seed)
    : generator_(seed), distribution_zero_to_one_(0.0, 1.0) {
  }
  /*!
  * \brief Generate a random integer
  * \param lower_bound lower bound
  * \param upper_bound upper bound
  * \return A random integer in [lower_bound, upper_bound)
  * \note Requires lower_bound < upper_bound; otherwise the underlying
  *       distribution is constructed with an inverted range (undefined).
  */
  inline int64_t NextInt(int64_t lower_bound, int64_t upper_bound) {
    std::uniform_int_distribution<int64_t> distribution(lower_bound, upper_bound - 1);
    return distribution(generator_);
  }
  /*!
  * \brief Generate random float data
  * \return A random float in [0.0, 1.0)
  */
  inline double NextDouble() {
    return distribution_zero_to_one_(generator_);
  }
  /*!
  * \brief Sample K values from {0, 1, ..., N-1} without replacement
  * \param N Population size
  * \param K Sample size
  * \return K ordered sampled values; empty when K > N
  */
  inline std::vector<size_t> Sample(size_t N, size_t K) {
    std::vector<size_t> ret;
    // fix: removed the tautological "K < 0" test (K is unsigned)
    if (K > N) {
      return ret;
    }
    ret.reserve(K);
    // selection sampling: keep i with probability
    // (still needed) / (still available); this yields exactly K values
    for (size_t i = 0; i < N; ++i) {
      double prob = (K - ret.size()) / static_cast<double>(N - i);
      if (NextDouble() < prob) {
        ret.push_back(i);
      }
    }
    return ret;
  }

 private:
  /*! \brief Random generator */
  std::mt19937 generator_;
  /*! \brief Cached distribution for [0.0, 1.0) */
  std::uniform_real_distribution<double> distribution_zero_to_one_;
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_RANDOM_H_
#ifndef LIGHTGBM_UTILS_TEXT_READER_H_
#define LIGHTGBM_UTILS_TEXT_READER_H_
#include <LightGBM/utils/pipeline_reader.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/random.h>
#include <cstdio>
#include <vector>
#include <string>
#include <functional>
namespace LightGBM {
/*!
* \brief Read text data from file
*/
template<typename INDEX_T>
class TextReader {
 public:
  /*!
  * \brief Constructor
  * \param filename Filename of data
  */
  TextReader(const char* filename)
    : filename_(filename) {
  }
  /*! \brief Destructor: releases the cached lines */
  ~TextReader() {
    Clear();
  }
  /*! \brief Clear cached data and release its memory */
  inline void Clear() {
    lines_.clear();
    lines_.shrink_to_fit();
  }
  /*!
  * \brief Get text data that was read from file
  * \return Text data, stored in std::vector by line
  */
  inline std::vector<std::string>& Lines() { return lines_; }
  /*!
  * \brief Stream the file via PipelineReader and call process_fun once per
  *        line ('\n', '\r' or "\r\n" terminated). Lines may span chunk
  *        boundaries; last_line_ buffers the partial tail between chunks.
  * \return Total number of lines processed
  */
  INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    PipelineReader::Read(filename_,
      [this, &total_cnt, &process_fun]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      // skip the break between '\r' and '\n' when it was split across chunks
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            // current line started in the previous chunk
            last_line_.append(buffer_process + last_i, i - last_i);
            process_fun(total_cnt, last_line_.c_str(), last_line_.size());
            last_line_ = "";
          }
          else {
            process_fun(total_cnt, buffer_process + last_i, i - last_i);
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip the rest of the line break, staying inside the chunk
          // (fix: this scan was unbounded and could read past read_cnt)
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        }
        else {
          ++i;
        }
      }
      if (last_i != read_cnt) {
        // buffer the unterminated tail for the next chunk
        last_line_ = std::string(buffer_process + last_i, read_cnt - last_i);
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Stdout("Warning: last line of file %s doesn't contain end of line, application will still use this line", filename_);
      process_fun(total_cnt, last_line_.c_str(), last_line_.size());
      ++total_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }
  /*!
  * \brief Read all text data from file into memory
  * \return Number of lines
  */
  INDEX_T ReadAllLines() {
    return ReadAllAndProcess(
      [this](INDEX_T, const char* buffer, size_t size) {
      lines_.emplace_back(buffer, size);
    });
  }
  /*!
  * \brief Reservoir-sample up to sample_cnt lines from the file
  * \return Total number of lines in the file
  */
  INDEX_T SampleFromFile(Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T cur_sample_cnt = 0;
    return ReadAllAndProcess(
      [this, &random, &cur_sample_cnt, &sample_cnt, &out_sampled_data]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      if (cur_sample_cnt < sample_cnt) {
        out_sampled_data->emplace_back(buffer, size);
        ++cur_sample_cnt;
      }
      else {
        // reservoir sampling: replace an existing sample with
        // probability sample_cnt / (line_idx + 1)
        const size_t idx = random.NextInt(0, line_idx + 1);
        if (idx < sample_cnt) {
          out_sampled_data->operator[](idx) = std::string(buffer, size);
        }
      }
    });
  }
  /*!
  * \brief Read part of text data from file in memory, use filter_fun to filter data
  * \param filter_fun Returns true for lines that should be kept
  * \param out_used_data_indices Stores the indices of kept lines
  * \return The number of total data
  */
  INDEX_T ReadAndFilterLines(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
      [this, &out_used_data_indices, &filter_fun]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) { lines_.emplace_back(buffer, size); }
    });
    return total_cnt;
  }
  /*!
  * \brief Filter lines with filter_fun and reservoir-sample up to
  *        sample_cnt of the kept lines
  * \return The number of total data
  */
  INDEX_T SampleAndFilterFromFile(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
    Random& random, size_t sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T cur_sample_cnt = 0;
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
      [this, &out_used_data_indices, &filter_fun, &random, &cur_sample_cnt, &sample_cnt, &out_sampled_data]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) {
        if (cur_sample_cnt < sample_cnt) {
          out_sampled_data->emplace_back(buffer, size);
          ++cur_sample_cnt;
        }
        else {
          const size_t idx = random.NextInt(0, out_used_data_indices->size());
          if (idx < sample_cnt) {
            out_sampled_data->operator[](idx) = std::string(buffer, size);
          }
        }
      }
    });
    return total_cnt;
  }
  /*! \brief Count the number of lines in the file without storing them */
  INDEX_T CountLine() {
    return ReadAllAndProcess(
      [this](INDEX_T, const char*, size_t) {
    });
  }
  /*!
  * \brief Stream the file in chunks: keep the lines of each chunk that pass
  *        filter_fun(used_cnt, total_cnt), hand them to process_fun with the
  *        starting used-index, then drop them (lines_ is reused as a
  *        per-chunk buffer and is left empty).
  * \return Total number of lines in the file
  */
  INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    INDEX_T used_cnt = 0;
    PipelineReader::Read(filename_,
      [this, &total_cnt, &process_fun, &used_cnt, &filter_fun]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      INDEX_T start_idx = used_cnt;
      // skip the break between '\r' and '\n' when it was split across chunks
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            last_line_.append(buffer_process + last_i, i - last_i);
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.push_back(last_line_);
              ++used_cnt;
            }
            last_line_ = "";
          }
          else {
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.emplace_back(buffer_process + last_i, i - last_i);
              ++used_cnt;
            }
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip the rest of the line break, staying inside the chunk
          // (fix: this scan was unbounded and could read past read_cnt)
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        }
        else {
          ++i;
        }
      }
      process_fun(start_idx, lines_);
      lines_.clear();
      if (last_i != read_cnt) {
        // buffer the unterminated tail for the next chunk
        last_line_ = std::string(buffer_process + last_i, read_cnt - last_i);
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Stdout("Warning: last line of file %s doesn't contain end of line, application will still use this line", filename_);
      if (filter_fun(used_cnt, total_cnt)) {
        lines_.push_back(last_line_);
        process_fun(used_cnt, lines_);
      }
      lines_.clear();
      ++total_cnt;
      ++used_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }
  /*! \brief Like ReadAllAndProcessParallelWithFilter, but keeps every line */
  INDEX_T ReadAllAndProcessParallel(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    return ReadAllAndProcessParallelWithFilter(process_fun, [](INDEX_T, INDEX_T) { return true; });
  }
  /*!
  * \brief Process only the lines whose index appears in used_data_indices
  *        (indices are matched in read order, so they are assumed sorted
  *        ascending -- TODO(review): confirm with callers)
  */
  INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    return ReadAllAndProcessParallelWithFilter(process_fun,
      [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
      if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
        return true;
      }
      else {
        return false;
      }
    });
  }

 private:
  /*! \brief Filename of text data */
  const char* filename_;
  /*! \brief Cache of the read text data */
  std::vector<std::string> lines_;
  /*! \brief Buffer for an unterminated line at a chunk boundary */
  std::string last_line_;
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_TEXT_READER_H_
#ifndef LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_
#include <omp.h>
#include <vector>
#include <functional>
namespace LightGBM {
class Threading {
 public:
  /*!
  * \brief Split the range [start, end) into one contiguous chunk per OpenMP
  *        thread and run inner_fun(thread_id, chunk_start, chunk_end) on
  *        each chunk in parallel.
  */
  template<typename INDEX_T>
  static inline void For(INDEX_T start, INDEX_T end, const std::function<void(int, INDEX_T, INDEX_T)>& inner_fun) {
    int num_threads = 1;
    // query the OpenMP team size once, from the master thread
#pragma omp parallel
#pragma omp master
    {
      num_threads = omp_get_num_threads();
    }
    // chunk size, rounded up so the whole range is covered
    INDEX_T num_inner = (end - start + num_threads - 1) / num_threads;
    if (num_inner <= 0) { num_inner = 1; }
#pragma omp parallel for schedule(static,1)
    for (int i = 0; i < num_threads; ++i) {
      INDEX_T inner_start = start + num_inner * i;
      INDEX_T inner_end = inner_start + num_inner;
      // clamp the last chunk; threads whose chunk starts past the end do nothing
      if (inner_end > end) { inner_end = end; }
      if (inner_start < end) {
        inner_fun(i, inner_start, inner_end);
      }
    }
  }
};
} // namespace LightGBM
#endif  // LightGBM_UTILS_THREADING_H_
# Expose the public LightGBM headers to every target below.
include_directories(${LightGBM_HEADER_DIR})
# MPI headers are only needed for the distributed (parallel) build.
if(USE_MPI)
include_directories(${MPI_CXX_INCLUDE_PATH})
endif()
# Collect the sources of each submodule.
# NOTE(review): AUX_SOURCE_DIRECTORY only globs at configure time;
# re-run CMake after adding new source files.
AUX_SOURCE_DIRECTORY(./application/ APPLICATION_SRC)
AUX_SOURCE_DIRECTORY(./boosting/ BOOSTING_SRC)
AUX_SOURCE_DIRECTORY(./io/ IO_SRC)
AUX_SOURCE_DIRECTORY(./metric/ METRIC_SRC)
AUX_SOURCE_DIRECTORY(./objective/ OBJECTIVE_SRC)
AUX_SOURCE_DIRECTORY(./network/ NETWORK_SRC)
AUX_SOURCE_DIRECTORY(./treelearner/ TREELEARNER_SRC)
add_executable(LightGBM main.cpp ${APPLICATION_SRC} ${BOOSTING_SRC} ${IO_SRC} ${METRIC_SRC} ${OBJECTIVE_SRC} ${NETWORK_SRC} ${TREELEARNER_SRC})
# Link against MPI for the distributed build.
if(USE_MPI)
TARGET_LINK_LIBRARIES(LightGBM ${MPI_CXX_LIBRARIES})
endif(USE_MPI)
#include <LightGBM/application.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include "predictor.hpp"
#include <omp.h>
#include <cstdio>
#include <ctime>
#include <chrono>
#include <fstream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace LightGBM {
/*!
* \brief Constructor: parses command line / config file parameters and
*        configures the OpenMP thread count.
*/
Application::Application(int argc, char** argv)
  : train_data_(nullptr), boosting_(nullptr), objective_fun_(nullptr) {
  LoadParameters(argc, argv);
  // set number of threads for openmp
  if (config_.num_threads > 0) {
    omp_set_num_threads(config_.num_threads);
  }
}
/*!
* \brief Destructor: releases datasets, metrics, boosting and objective
*        objects, and shuts the network down in parallel mode.
*/
Application::~Application() {
  if (train_data_ != nullptr) { delete train_data_; }
  for (auto& data : valid_datas_) {
    if (data != nullptr) { delete data; }
  }
  valid_datas_.clear();
  for (auto& metric : train_metric_) {
    if (metric != nullptr) { delete metric; }
  }
  // valid_metrics_ holds one metric vector per validation dataset
  for (auto& metric : valid_metrics_) {
    for (auto& sub_metric : metric) {
      if (sub_metric != nullptr) { delete sub_metric; }
    }
  }
  valid_metrics_.clear();
  if (boosting_ != nullptr) { delete boosting_; }
  if (objective_fun_ != nullptr) { delete objective_fun_; }
  if (config_.is_parallel) {
    // release the network used for distributed training
    Network::Dispose();
  }
}
/*!
* \brief Load parameters from the command line, then from the optional
*        config file; command line values win on conflict.
* \param argc Argument count (argv[0] carries no '=' and is skipped)
* \param argv Arguments of the form key=value
*/
void Application::LoadParameters(int argc, char** argv) {
  std::unordered_map<std::string, std::string> params;
  // Parse one "key=value" token into params. When only_if_absent is set,
  // an existing key is kept (used so the command line overrides the
  // config file). Tokens without exactly one '=' or an empty key are
  // ignored. This lambda replaces the previously duplicated parsing code.
  auto parse_one = [&params](const std::string& token, bool only_if_absent) {
    std::vector<std::string> kv = Common::Split(token.c_str(), '=');
    if (kv.size() != 2) { return; }
    std::string key = Common::Trim(kv[0]);
    std::string value = Common::Trim(kv[1]);
    if (key.size() <= 0) { return; }
    if (!only_if_absent || params.count(key) == 0) {
      params[key] = value;
    }
  };
  for (int i = 0; i < argc; ++i) {
    parse_one(argv[i], false);
  }
  // check for alias
  ParameterAlias::KeyAliasTransform(&params);
  // read parameters from config file
  if (params.count("config_file") > 0) {
    TextReader<size_t> config_reader(params["config_file"].c_str());
    config_reader.ReadAllLines();
    if (config_reader.Lines().size() > 0) {
      for (auto& line : config_reader.Lines()) {
        line = Common::Trim(line);
        // skip blank lines and comments
        if (line.size() == 0 || line[0] == '#') {
          continue;
        }
        // command line has higher priority, so keep existing keys
        parse_one(line, true);
      }
    } else {
      Log::Stdout("config file: %s doesn't exist, will ignore",
        params["config_file"].c_str());
    }
  }
  // check for alias again
  ParameterAlias::KeyAliasTransform(&params);
  // load configs
  config_.Set(params);
  Log::Stdout("finished load parameters");
}
/*!
* \brief Load training and validation data, create their metrics, and save
*        binary versions of the datasets when configured.
*/
void Application::LoadData() {
  auto start_time = std::chrono::high_resolution_clock::now();
  // prediction is needed if using an input initial model (continued training)
  PredictFunction predict_fun = nullptr;
  Predictor* predictor = nullptr;
  // load init model
  if (config_.io_config.input_model.size() > 0) {
    LoadModel();
    if (boosting_->NumberOfSubModels() > 0) {
      predictor = new Predictor(boosting_, config_.io_config.is_sigmoid);
      predict_fun =
        [&predictor](const std::vector<std::pair<int, double>>& features) {
        return predictor->PredictRawOneLine(features);
      };
    }
  }
  // sync up random seed for data partition
  if (config_.is_parallel_find_bin) {
    config_.io_config.data_random_seed =
      GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
  }
  train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
    config_.io_config.input_init_score.c_str(),
    config_.io_config.max_bin,
    config_.io_config.data_random_seed,
    config_.io_config.is_enable_sparse,
    predict_fun);
  // load training data
  if (config_.is_parallel_find_bin) {
    // load data for parallel training
    train_data_->LoadTrainData(Network::rank(), Network::num_machines(),
      config_.io_config.is_pre_partition,
      config_.io_config.use_two_round_loading);
  } else {
    // load data for single machine
    train_data_->LoadTrainData(config_.io_config.use_two_round_loading);
  }
  // save binary file if requested
  if (config_.io_config.is_save_binary_file) {
    train_data_->SaveBinaryFile();
  }
  // create training metric
  if (config_.metric_config.is_provide_training_metric) {
    for (auto metric_type : config_.metric_types) {
      Metric* metric =
        Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init("training", train_data_->metadata(),
        train_data_->num_data());
      train_metric_.push_back(metric);
    }
  }
  // add validation data, if it exists
  for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
    // add
    valid_datas_.push_back(
      new Dataset(config_.io_config.valid_data_filenames[i].c_str(),
        config_.io_config.max_bin,
        config_.io_config.data_random_seed,
        config_.io_config.is_enable_sparse,
        predict_fun));
    // load validation data like train data
    valid_datas_.back()->LoadValidationData(train_data_,
      config_.io_config.use_two_round_loading);
    // save binary file if requested
    if (config_.io_config.is_save_binary_file) {
      valid_datas_.back()->SaveBinaryFile();
    }
    // add metrics for validation data
    valid_metrics_.emplace_back();
    for (auto metric_type : config_.metric_types) {
      Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init(config_.io_config.valid_data_filenames[i].c_str(),
        valid_datas_.back()->metadata(),
        valid_datas_.back()->num_data());
      valid_metrics_.back().push_back(metric);
    }
  }
  // the predictor is only needed while loading
  if (predictor != nullptr) {
    delete predictor;
  }
  auto end_time = std::chrono::high_resolution_clock::now();
  // output time spent on loading data
  // fix: pass a plain double through the "%f" varargs, not a
  // std::chrono::duration object (passing a non-trivial class through
  // varargs is undefined behavior)
  Log::Stdout("Finish loading data, use %f seconds ",
    std::chrono::duration<double>(end_time - start_time).count());
}
/*!
* \brief Initialize network (if parallel), boosting, objective function,
*        data and metrics before training starts.
*/
void Application::InitTrain() {
  if (config_.is_parallel) {
    // need init network
    Network::Init(config_.network_config);
    Log::Stdout("finish network initialization");
    // sync global random seed for feature partition
    if (config_.boosting_type == BoostingType::kGBDT) {
      GBDTConfig* gbdt_config =
        dynamic_cast<GBDTConfig*>(config_.boosting_config);
      gbdt_config->tree_config.feature_fraction_seed =
        GlobalSyncUpByMin<int>(gbdt_config->tree_config.feature_fraction_seed);
      gbdt_config->tree_config.feature_fraction =
        GlobalSyncUpByMin<double>(gbdt_config->tree_config.feature_fraction);
    }
  }
  // create boosting
  boosting_ =
    Boosting::CreateBoosting(config_.boosting_type, config_.boosting_config);
  // create objective function
  objective_fun_ =
    ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
      config_.objective_config);
  // load training data
  LoadData();
  // initialize the objective function
  objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
  // initialize the boosting
  boosting_->Init(train_data_, objective_fun_,
    ConstPtrInVectorWarpper<Metric>(train_metric_),
    config_.io_config.output_model.c_str());
  // add validation data into boosting
  for (size_t i = 0; i < valid_datas_.size(); ++i) {
    boosting_->AddDataset(valid_datas_[i],
      ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
  }
  Log::Stdout("finish training init");
}
/*! \brief Run the training loop of the boosting model. */
void Application::Train() {
  Log::Stdout("start train");
  boosting_->Train();
  Log::Stdout("finish train");
}
/*!
* \brief Predict on the configured data file and write the results to the
*        configured output file.
*/
void Application::Predict() {
  // create predictor
  Predictor predictor(boosting_, config_.io_config.is_sigmoid);
  predictor.Predict(config_.io_config.data_filename.c_str(),
    config_.io_config.data_has_label, config_.io_config.output_result.c_str());
  Log::Stdout("finish predict");
}
/*! \brief Create the boosting object and load the input model for prediction. */
void Application::InitPredict() {
  boosting_ =
    Boosting::CreateBoosting(config_.boosting_type, config_.boosting_config);
  LoadModel();
  Log::Stdout("finish predict init");
}
/*!
* \brief Read the input model file and restore the boosting model from its
*        newline-joined content.
*/
void Application::LoadModel() {
  TextReader<size_t> model_reader(config_.io_config.input_model.c_str());
  model_reader.ReadAllLines();
  std::string model_str;
  for (const auto& line : model_reader.Lines()) {
    model_str += line;
    model_str += '\n';
  }
  boosting_->ModelsFromString(model_str, config_.io_config.num_model_predict);
}
/*!
* \brief All-reduce helper: returns the minimum of `local` across all
*        machines (or `local` itself when not running in parallel).
* NOTE(review): `local` is passed by non-const reference because it is used
* as the Allreduce input buffer; it is not modified here.
*/
template<typename T>
T Application::GlobalSyncUpByMin(T& local) {
  T global = local;
  if (!config_.is_parallel) {
    // no need to sync when not running parallel learning
    return global;
  }
  // element-wise min reduction over the raw bytes of T
  Network::Allreduce(reinterpret_cast<char*>(&local),
    sizeof(local), sizeof(local),
    reinterpret_cast<char*>(&global),
    [](const char* src, char* dst, int len) {
    int used_size = 0;
    const int type_size = sizeof(T);
    const T *p1;
    T *p2;
    while (used_size < len) {
      p1 = reinterpret_cast<const T *>(src);
      p2 = reinterpret_cast<T *>(dst);
      // keep the smaller value in the destination buffer
      if (*p1 < *p2) {
        std::memcpy(dst, src, type_size);
      }
      src += type_size;
      dst += type_size;
      used_size += type_size;
    }
  });
  return global;
}
} // namespace LightGBM
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>
#include <omp.h>
#include <cstring>
#include <cstdio>
#include <vector>
#include <utility>
#include <functional>
#include <string>
namespace LightGBM {
/*!
* \brief Used to prediction data with input model
*/
class Predictor {
public:
/*!
* \brief Constructor
* \param boosting Input boosting model
* \param is_simgoid True if prediction should apply the sigmoid transform
*        (e.g. binary classification). NOTE(review): "simgoid" is a typo
*        for "sigmoid"; the member is declared elsewhere with the same
*        spelling, so it is kept as-is here.
*/
Predictor(const Boosting* boosting, bool is_simgoid)
  : is_simgoid_(is_simgoid) {
  boosting_ = boosting;
  // dense feature buffer size: highest feature index used by the model + 1
  num_features_ = boosting_->MaxFeatureIdx() + 1;
  // query the OpenMP team size once, from the master thread
#pragma omp parallel
#pragma omp master
  {
    num_threads_ = omp_get_num_threads();
  }
  // one dense feature buffer per thread, so predictions can run in parallel
  features_ = new double*[num_threads_];
  for (int i = 0; i < num_threads_; ++i) {
    features_[i] = new double[num_features_];
  }
}
/*!
* \brief Destructor: releases the per-thread feature buffers
*/
~Predictor() {
  if (features_ != nullptr) {
    for (int i = 0; i < num_threads_; ++i) {
      delete[] features_[i];
    }
    delete[] features_;
  }
}
/*!
* \brief Predict one record, returning the raw score (no sigmoid transform)
* \param features Sparse (index, value) features of this record
* \return Raw prediction result
*/
double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
  const int tid = omp_get_thread_num();
  double* buffer = features_[tid];
  // reset this thread's dense buffer, then scatter the sparse input into it
  std::memset(buffer, 0, sizeof(double) * num_features_);
  for (size_t i = 0; i < features.size(); ++i) {
    const int feature_idx = features[i].first;
    // features beyond what the model knows are ignored
    if (feature_idx < num_features_) {
      buffer[feature_idx] = features[i].second;
    }
  }
  // raw score, without the sigmoid transform
  return boosting_->PredictRaw(buffer);
}
/*!
* \brief prediction for one record, will use sigmoid transform if needed(only needs in binary classification now)
* \param features Feature for this record
* \return Prediction result
*/
double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = omp_get_thread_num();
// init feature value
std::memset(features_[tid], 0, sizeof(double)*num_features_);
// put feature value
for (const auto& p : features) {
if (p.first < num_features_) {
features_[tid][p.first] = p.second;
}
}
// get result with sigmoid transform
return boosting_->Predict(features_[tid]);
}
/*!
* \brief prediction for a data, and save result
* \param data_filename Filename of data
* \param has_label True if this data contains label
* \param result_filename Filename of output result
*/
void Predict(const char* data_filename, bool has_label, const char* result_filename) {
FILE* result_file;
#ifdef _MSC_VER
fopen_s(&result_file, result_filename, "w");
#else
result_file = fopen(result_filename, "w");
#endif
if (result_file == NULL) {
Log::Stderr("predition result file %s doesn't exists", data_filename);
}
Parser* parser = Parser::CreateParser(data_filename);
if (parser == nullptr) {
Log::Stderr("can regonise input data format, filename %s", data_filename);
}
// function for parse data
std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
double tmp_label;
if (has_label) {
// parse function with label
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
Log::Stdout("start prediction for data %s, and data has label", data_filename);
} else {
// parse function without label
parser_fun = [this, &parser]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature);
};
Log::Stdout("start prediction for data %s, and data doesn't has label", data_filename);
}
std::function<double(const std::vector<std::pair<int, double>>&)> predict_fun;
if (is_simgoid_) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return PredictOneLine(features);
};
} else {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return PredictRawOneLine(features);
};
}
std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
[this, &parser_fun, &predict_fun, &result_file]
(data_size_t, const std::vector<std::string>& lines) {
std::vector<std::pair<int, double>> oneline_features;
std::vector<double> pred_result(lines.size(), 0.0f);
#pragma omp parallel for schedule(static) private(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
oneline_features.clear();
// parser
parser_fun(lines[i].c_str(), &oneline_features);
// predict
pred_result[i] = predict_fun(oneline_features);
}
for (size_t i = 0; i < pred_result.size(); ++i) {
fprintf(result_file, "%f\n", pred_result[i]);
}
};
TextReader<data_size_t> predict_data_reader(data_filename);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
fclose(result_file);
delete parser;
}
private:
/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief Buffer for feature values */
double** features_;
/*! \brief Number of features */
int num_features_;
/*! \brief True if need to predict result with sigmoid transform */
bool is_simgoid_;
/*! \brief Number of threads */
int num_threads_;
};
} // namespace LightGBM
#endif #endif // LightGBM_PREDICTOR_HPP_
#include <LightGBM/boosting.h>
#include "gbdt.h"
namespace LightGBM {
// Factory for boosting objects; GBDT is the only implementation for now.
// Returns nullptr for an unknown type.
Boosting* Boosting::CreateBoosting(BoostingType type,
  const BoostingConfig* config) {
  switch (type) {
  case BoostingType::kGBDT:
    return new GBDT(config);
  default:
    return nullptr;
  }
}
} // namespace LightGBM
#include "gbdt.h"

#include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/metric.h>
#include <LightGBM/objective_function.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <sstream>
#include <string>
#include <vector>
namespace LightGBM {
// Constructor: all owned pointers start null; buffers are allocated in Init().
GBDT::GBDT(const BoostingConfig* config)
  : tree_learner_(nullptr), train_score_updater_(nullptr),
  gradients_(nullptr), hessians_(nullptr),
  out_of_bag_data_indices_(nullptr), bag_data_indices_(nullptr) {
  // keep a typed view of the boosting configuration
  gbdt_config_ = dynamic_cast<const GBDTConfig*>(config);
  max_feature_idx_ = 0;
}
// Destructor: release everything this object owns.
// delete / delete[] on a null pointer is a no-op, so no null checks needed.
GBDT::~GBDT() {
  delete tree_learner_;
  delete[] gradients_;
  delete[] hessians_;
  delete[] out_of_bag_data_indices_;
  delete[] bag_data_indices_;
  for (auto& tree : models_) {
    delete tree;
  }
  delete train_score_updater_;
  for (auto& score_tracker : valid_score_updater_) {
    delete score_tracker;
  }
}
// Initialize training state: tree learner, score updater, gradient buffers,
// bagging buffers, RNG and the output model file.
void GBDT::Init(const Dataset* train_data, const ObjectiveFunction* object_function,
  const std::vector<const Metric*>& training_metrics, const char* output_model_filename) {
  train_data_ = train_data;
  // create tree learner
  tree_learner_ =
    TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config);
  // init tree learner
  tree_learner_->Init(train_data_);
  object_function_ = object_function;
  // push training metrics
  for (const auto& metric : training_metrics) {
    training_metrics_.push_back(metric);
  }
  // create score tracker
  train_score_updater_ = new ScoreUpdater(train_data_);
  num_data_ = train_data_->num_data();
  // create buffer for gradients and hessians
  gradients_ = new score_t[num_data_];
  hessians_ = new score_t[num_data_];
  // get max feature index
  for (int i = 0; i < train_data->num_features(); ++i) {
    max_feature_idx_ = Common::Max<int>(max_feature_idx_,
      train_data->FeatureAt(i)->feature_index());
  }
  // if need bagging, create buffer
  if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
    out_of_bag_data_indices_ = new data_size_t[num_data_];
    bag_data_indices_ = new data_size_t[num_data_];
  } else {
    out_of_bag_data_cnt_ = 0;
    out_of_bag_data_indices_ = nullptr;
    bag_data_cnt_ = num_data_;
    bag_data_indices_ = nullptr;
  }
  // initialize random generator
  random_ = Random(gbdt_config_->bagging_seed);
  // open model output file
#ifdef _MSC_VER
  fopen_s(&output_model_file, output_model_filename, "w");
#else
  output_model_file = fopen(output_model_filename, "w");
#endif
  // fail loudly instead of crashing on fprintf(NULL, ...) below / in Train()
  if (output_model_file == NULL) {
    Log::Stderr("cannot create output model file %s", output_model_filename);
    return;
  }
  // write the (currently empty) model header
  fprintf(output_model_file, "%s", this->ModelsToString().c_str());
}
// Register a validation data set: one score updater plus its metric list.
void GBDT::AddDataset(const Dataset* valid_data,
  const std::vector<const Metric*>& valid_metrics) {
  valid_score_updater_.push_back(new ScoreUpdater(valid_data));
  // copy-construct the per-dataset metric vector from the given range
  valid_metrics_.emplace_back(valid_metrics.begin(), valid_metrics.end());
}
// Re-sample the in-bag / out-of-bag partition for this iteration.
// Only runs when bagging buffers were allocated in Init() and the iteration
// is on the bagging schedule. The exact sequence of random_.NextDouble()
// calls defines reproducibility for a given bagging_seed.
void GBDT::Bagging(int iter) {
  // if need bagging
  if (out_of_bag_data_indices_ != nullptr && iter % gbdt_config_->bagging_freq == 0) {
    // if doesn't have query data
    if (train_data_->metadata().query_boundaries() == nullptr) {
      bag_data_cnt_ =
        static_cast<data_size_t>(gbdt_config_->bagging_fraction * num_data_);
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one record.
      // one-pass sampling: keep record i with probability
      // (#still needed) / (#still available), which selects exactly
      // bag_data_cnt_ records uniformly without replacement
      for (data_size_t i = 0; i < num_data_; ++i) {
        double prob =
          (bag_data_cnt_ - cur_left_cnt) / static_cast<double>(num_data_ - i);
        if (random_.NextDouble() < prob) {
          bag_data_indices_[cur_left_cnt++] = i;
        } else {
          out_of_bag_data_indices_[cur_right_cnt++] = i;
        }
      }
    } else {
      // if have query data: sample whole queries so ranking groups stay intact
      const data_size_t* query_boundaries = train_data_->metadata().query_boundaries();
      data_size_t num_query = train_data_->metadata().num_queries();
      data_size_t bag_query_cnt =
        static_cast<data_size_t>(num_query * gbdt_config_->bagging_fraction);
      data_size_t cur_left_query_cnt = 0;
      data_size_t cur_left_cnt = 0;
      data_size_t cur_right_cnt = 0;
      // random bagging, minimal unit is one query
      for (data_size_t i = 0; i < num_query; ++i) {
        double prob =
          (bag_query_cnt - cur_left_query_cnt) / static_cast<double>(num_query - i);
        if (random_.NextDouble() < prob) {
          // all records of the selected query go in-bag
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            bag_data_indices_[cur_left_cnt++] = j;
          }
          cur_left_query_cnt++;
        } else {
          for (data_size_t j = query_boundaries[i]; j < query_boundaries[i + 1]; ++j) {
            out_of_bag_data_indices_[cur_right_cnt++] = j;
          }
        }
      }
      bag_data_cnt_ = cur_left_cnt;
      out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
    }
    Log::Stdout("re-bagging, using %d data to train", bag_data_cnt_);
    // set bagging data to tree learner
    tree_learner_->SetBaggingData(bag_data_indices_, bag_data_cnt_);
  }
}
// Refresh scores of out-of-bag records: they were skipped during training,
// but must be up to date before the next re-bagging.
void GBDT::UpdateScoreOutOfBag(const Tree* tree) {
  if (out_of_bag_data_indices_ == nullptr) {
    return;  // bagging disabled, nothing to update
  }
  train_score_updater_->AddScore(tree, out_of_bag_data_indices_, out_of_bag_data_cnt_);
}
void GBDT::Train() {
// training start time
auto start_time = std::chrono::high_resolution_clock::now();
for (int iter = 0; iter < gbdt_config_->num_iterations; ++iter) {
// boosting first
Boosting();
// bagging logic
Bagging(iter);
// train a new tree
Tree * new_tree = TrainOneTree();
// if cannon learn a new tree, stop
if (new_tree->num_leaves() <= 1) {
Log::Stdout("Cannot do any boosting for tree cannot split");
break;
}
// Shrinkage by learning rate
new_tree->Shrinkage(gbdt_config_->learning_rate);
// update score
UpdateScore(new_tree);
UpdateScoreOutOfBag(new_tree);
// print message for metric
OutputMetric(iter + 1);
// add model
models_.push_back(new_tree);
// write model to file on every iteration
fprintf(output_model_file, "Tree=%d\n", iter);
fprintf(output_model_file, "%s\n", new_tree->ToString().c_str());
fflush(output_model_file);
auto end_time = std::chrono::high_resolution_clock::now();
// output used time on each iteration
Log::Stdout("%f seconds elapsed, finished %d iteration", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1);
}
// close file
fclose(output_model_file);
}
// Train one tree on the pre-computed first/second order derivative buffers.
Tree* GBDT::TrainOneTree() {
  Tree* trained_tree = tree_learner_->Train(gradients_, hessians_);
  return trained_tree;
}
void GBDT::UpdateScore(const Tree* tree) {
// update training score
train_score_updater_->AddScore(tree_learner_);
// update validation score
for (auto& score_tracker : valid_score_updater_) {
score_tracker->AddScore(tree);
}
}
void GBDT::OutputMetric(int iter) {
// print training metric
for (auto& sub_metric : training_metrics_) {
sub_metric->Print(iter, train_score_updater_->score());
}
// print validation metric
for (size_t i = 0; i < valid_metrics_.size(); ++i) {
for (auto& sub_metric : valid_metrics_[i]) {
sub_metric->Print(iter, valid_score_updater_[i]->score());
}
}
}
void GBDT::Boosting() {
// objective function will calculation gradients and hessians
object_function_->
GetGradients(train_score_updater_->score(), gradients_, hessians_);
}
std::string GBDT::ModelsToString() const {
// serialize this object to string
std::stringstream ss;
// output max_feature_idx
ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output sigmoid parameter
ss << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
ss << std::endl;
// output tree models
for (size_t i = 0; i < models_.size(); ++i) {
ss << "Tree=" << i << std::endl;
ss << models_[i]->ToString() << std::endl;
}
return ss.str();
}
void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
// use serialized string to restore this object
models_.clear();
std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
size_t i = 0;
// get max_feature_idx first
while (i < lines.size()) {
size_t find_pos = lines[i].find("max_feature_idx=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atoi(strs[1].c_str(), &max_feature_idx_);
++i;
break;
} else {
++i;
}
}
if (i == lines.size()) {
Log::Stderr("The model doesn't contain max_feature_idx");
return;
}
// get sigmoid parameter
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("sigmoid=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atof(strs[1].c_str(), &sigmoid_);
++i;
break;
} else {
++i;
}
}
// if sigmoid doesn't exists
if (i == lines.size()) {
sigmoid_ = -1.0;
}
// get tree models
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("Tree=");
if (find_pos != std::string::npos) {
++i;
int start = static_cast<int>(i);
while (i < lines.size() && lines[i].find("Tree=") == std::string::npos) { ++i; }
int end = static_cast<int>(i);
std::string tree_str = Common::Join(lines, start, end, '\n');
models_.push_back(new Tree(tree_str));
if (num_used_model > 0 && models_.size() >= num_used_model) {
break;
}
} else {
++i;
}
}
Log::Stdout("Loaded %d modles\n", models_.size());
}
double GBDT::PredictRaw(const double* value) const {
double ret = 0.0;
for (size_t i = 0; i < models_.size(); ++i) {
ret += models_[i]->Predict(value);
}
return ret;
}
double GBDT::Predict(const double* value) const {
double ret = 0.0;
for (size_t i = 0; i < models_.size(); ++i) {
ret += models_[i]->Predict(value);
}
// if need sigmoid transform
if (sigmoid_ > 0) {
ret = 1.0 / (1.0 + std::exp(-sigmoid_ * ret));
}
return ret;
}
} // namespace LightGBM
#ifndef LIGHTGBM_BOOSTING_GBDT_H_
#define LIGHTGBM_BOOSTING_GBDT_H_
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include <cstdio>
#include <vector>
#include <string>
namespace LightGBM {
/*!
* \brief GBDT algorithm implementation, including training, prediction and bagging.
*/
class GBDT: public Boosting {
public:
  /*!
  * \brief Constructor
  * \param config Config of GBDT
  */
  explicit GBDT(const BoostingConfig* config);
  /*!
  * \brief Destructor
  */
  ~GBDT();
  /*!
  * \brief Initialization logic
  * \param train_data Training data
  * \param object_function Training objective function
  * \param training_metrics Training metrics
  * \param output_model_filename Filename of output model
  */
  void Init(const Dataset* train_data, const ObjectiveFunction* object_function,
    const std::vector<const Metric*>& training_metrics,
    const char* output_model_filename)
    override;
  /*!
  * \brief Add a validation data set
  * \param valid_data Validation data
  * \param valid_metrics Metrics for this validation data
  */
  void AddDataset(const Dataset* valid_data,
    const std::vector<const Metric*>& valid_metrics) override;
  /*!
  * \brief Run the training loop over all boosting iterations
  */
  void Train() override;
  /*!
  * \brief Prediction for one record, without sigmoid transform
  * \param feature_values Feature values of this record
  * \return Prediction result for this record
  */
  double PredictRaw(const double * feature_values) const override;
  /*!
  * \brief Prediction for one record, with sigmoid transform if needed
  * \param feature_values Feature values of this record
  * \return Prediction result for this record
  */
  double Predict(const double * feature_values) const override;
  /*!
  * \brief Serialize the model to a string
  * \return String representation of the trained model
  */
  std::string ModelsToString() const override;
  /*!
  * \brief Restore the model from a serialized string
  * \param model_str The string of model
  * \param num_used_model If > 0, load at most this many leading trees
  */
  void ModelsFromString(const std::string& model_str, int num_used_model) override;
  /*!
  * \brief Get max feature index of this model
  * \return Max feature index of this model
  */
  inline int MaxFeatureIdx() const override { return max_feature_idx_; }
  /*!
  * \brief Get number of weak sub-models (trees)
  * \return Number of weak sub-models
  */
  inline int NumberOfSubModels() const override { return static_cast<int>(models_.size()); }
private:
  /*!
  * \brief Implement bagging logic
  * \param iter Current iteration
  */
  void Bagging(int iter);
  /*!
  * \brief Update score for out-of-bag data.
  * Necessary because data may be re-bagged during training.
  * \param tree Trained tree of this iteration
  */
  void UpdateScoreOutOfBag(const Tree* tree);
  /*!
  * \brief Calculate gradients/hessians via the objective function
  */
  void Boosting();
  /*!
  * \brief Train one tree
  * \return Trained tree of this iteration
  */
  Tree* TrainOneTree();
  /*!
  * \brief Update scores after a tree was trained
  * \param tree Trained tree of this iteration
  */
  void UpdateScore(const Tree* tree);
  /*!
  * \brief Print metric result of the current iteration
  * \param iter Current iteration
  */
  void OutputMetric(int iter);
  /*! \brief Pointer to training data */
  const Dataset* train_data_;
  /*! \brief Config of gbdt */
  const GBDTConfig* gbdt_config_;
  /*! \brief Tree learner, will use this class to learn trees */
  TreeLearner* tree_learner_;
  /*! \brief Objective function */
  const ObjectiveFunction* object_function_;
  /*! \brief Store and update training data's score */
  ScoreUpdater* train_score_updater_;
  /*! \brief Metrics for training data */
  std::vector<const Metric*> training_metrics_;
  /*! \brief Store and update validation data's scores */
  std::vector<ScoreUpdater*> valid_score_updater_;
  /*! \brief Metric for validation data */
  std::vector<std::vector<const Metric*>> valid_metrics_;
  /*! \brief Trained models(trees) */
  std::vector<Tree*> models_;
  /*! \brief Max feature index of training data */
  int max_feature_idx_;
  /*! \brief First order derivative of training data */
  score_t* gradients_;
  /*! \brief Second order derivative of training data */
  score_t* hessians_;
  /*! \brief Store the data indices of out-of-bag */
  data_size_t* out_of_bag_data_indices_;
  /*! \brief Number of out-of-bag data */
  data_size_t out_of_bag_data_cnt_;
  /*! \brief Store the indices of in-bag data */
  data_size_t* bag_data_indices_;
  /*! \brief Number of in-bag data */
  data_size_t bag_data_cnt_;
  /*! \brief Number of training data */
  data_size_t num_data_;
  /*! \brief Random generator, used for bagging */
  Random random_;
  /*! \brief File handle the model is written to during training (opened in Init) */
  FILE * output_model_file;
  /*!
  * \brief Sigmoid parameter, used for prediction.
  * If > 0, output scores are transformed by the sigmoid function
  */
  double sigmoid_;
};
} // namespace LightGBM
#endif #endif // LightGBM_BOOSTING_GBDT_H_
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/tree_learner.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief Used to store and update scores for a data set
*/
class ScoreUpdater {
public:
  /*!
  * \brief Constructor
  * \param data Data set this updater is bound to (not owned)
  */
  explicit ScoreUpdater(const Dataset* data)
    :data_(data) {
    num_data_ = data->num_data();
    score_ = new score_t[num_data_];
    // default start score is zero
    std::memset(score_, 0, sizeof(score_t)*num_data_);
    const score_t* init_score = data->metadata().init_score();
    // if an initial score exists, start from it
    if (init_score != nullptr) {
      for (data_size_t i = 0; i < num_data_; ++i) {
        score_[i] = init_score[i];
      }
    }
  }
  /*! \brief Destructor */
  ~ScoreUpdater() {
    delete[] score_;
  }
  // This class owns score_ through a raw pointer: forbid copying so the
  // implicitly generated copy operations cannot cause a double free.
  ScoreUpdater(const ScoreUpdater&) = delete;
  ScoreUpdater& operator=(const ScoreUpdater&) = delete;
  /*!
  * \brief Use a tree model to predict and add to the score of all data.
  * Note: generally used for validation data.
  * \param tree Trained tree model
  */
  inline void AddScore(const Tree* tree) {
    tree->AddPredictionToScore(data_, num_data_, score_);
  }
  /*!
  * \brief Add prediction score, only used for training data.
  * After a tree is trained, the training data is already partitioned into
  * its leaves, so the learner can add predictions much faster.
  * \param tree_learner Learner that holds the leaf partition
  */
  inline void AddScore(const TreeLearner* tree_learner) {
    tree_learner->AddPredictionToScore(score_);
  }
  /*!
  * \brief Like AddScore(const Tree*), but only for a subset of the data.
  * Used to score the out-of-bag part of the training data.
  * \param tree Trained tree model
  * \param data_indices Indices of the records to process
  * \param data_cnt Number of records to process
  */
  inline void AddScore(const Tree* tree, const data_size_t* data_indices,
    data_size_t data_cnt) {
    tree->AddPredictionToScore(data_, data_indices, data_cnt, score_);
  }
  /*! \brief Pointer to the current scores */
  inline const score_t * score() { return score_; }
private:
  /*! \brief Number of total data */
  data_size_t num_data_;
  /*! \brief Pointer to the bound data set (not owned) */
  const Dataset* data_;
  /*! \brief Scores for the data set (owned) */
  score_t* score_;
};
} // namespace LightGBM
#endif #endif // LightGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/bin.h>
#include "dense_bin.hpp"
#include "sparse_bin.hpp"
#include <cmath>
#include <cstring>
#include <cstdint>
#include <limits>
#include <vector>
#include <algorithm>
namespace LightGBM {
// Default constructor: no bounds until FindBin()/CopyFrom() fills them in.
BinMapper::BinMapper() {
  bin_upper_bound_ = nullptr;
}
// Copy constructor: deep-copies the bound array so both mappers
// own independent storage.
BinMapper::BinMapper(const BinMapper& other)
  : bin_upper_bound_(nullptr) {
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
  bin_upper_bound_ = new double[num_bin_];
  std::copy(other.bin_upper_bound_, other.bin_upper_bound_ + num_bin_, bin_upper_bound_);
}
// Deserializing constructor: restore state from a buffer written by CopyTo().
BinMapper::BinMapper(const void* memory)
  : bin_upper_bound_(nullptr) {
  const char* mem_ptr = reinterpret_cast<const char*>(memory);
  CopyFrom(mem_ptr);
}
// Destructor: bin_upper_bound_ is either nullptr or owned by this object.
BinMapper::~BinMapper() {
  delete[] bin_upper_bound_;
}
// Build the bin boundaries for one feature from a sample of its values.
// Sets num_bin_, bin_upper_bound_, is_trival_ and sparse_rate_.
// `values` is sorted in place.
void BinMapper::FindBin(std::vector<double>* values, int max_bin) {
  size_t sample_size = values->size();
  // guard: an empty sample previously indexed (*values)[0] (undefined
  // behavior); produce a single trivial bin instead
  if (sample_size == 0) {
    num_bin_ = 1;
    bin_upper_bound_ = new double[1];
    bin_upper_bound_[0] = std::numeric_limits<double>::infinity();
    is_trival_ = true;
    sparse_rate_ = 0.0;
    return;
  }
  // find distinct values and their counts (RAII buffers, no manual delete)
  std::vector<double> distinct_values(sample_size);
  std::vector<int> counts(sample_size);
  int num_values = 1;
  std::sort(values->begin(), values->end());
  distinct_values[0] = (*values)[0];
  counts[0] = 1;
  for (size_t i = 1; i < values->size(); ++i) {
    if ((*values)[i] != (*values)[i - 1]) {
      distinct_values[num_values] = (*values)[i];
      counts[num_values] = 1;
      ++num_values;
    } else {
      ++counts[num_values - 1];
    }
  }
  int cnt_in_bin0 = 0;
  if (num_values <= max_bin) {
    // few enough distinct values: one bin per distinct value
    num_bin_ = num_values;
    bin_upper_bound_ = new double[num_values];
    for (int i = 0; i < num_values - 1; ++i) {
      // boundary is the midpoint between adjacent distinct values
      bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
    }
    cnt_in_bin0 = counts[0];
    bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
  } else {
    // need to merge distinct values into at most max_bin bins
    num_bin_ = max_bin;
    bin_upper_bound_ = new double[max_bin];
    std::vector<double> bin_lower_bound(max_bin);
    // mean size for one bin, re-estimated after every split
    double mean_bin_size = sample_size / static_cast<double>(max_bin);
    int rest_sample_cnt = static_cast<int>(sample_size);
    int cur_cnt_inbin = 0;
    int bin_cnt = 0;
    bin_lower_bound[0] = distinct_values[0];
    for (int i = 0; i < num_values - 1; ++i) {
      rest_sample_cnt -= counts[i];
      cur_cnt_inbin += counts[i];
      // close the current bin when it is full; the bin_cnt < max_bin - 1
      // guard keeps the last bin open for all remaining values (and fixes
      // an out-of-range write to bin_lower_bound / division by zero that
      // could occur when every iteration triggered a split)
      if (cur_cnt_inbin >= mean_bin_size && bin_cnt < max_bin - 1) {
        bin_upper_bound_[bin_cnt] = distinct_values[i];
        if (bin_cnt == 0) { cnt_in_bin0 = cur_cnt_inbin; }
        ++bin_cnt;
        bin_lower_bound[bin_cnt] = distinct_values[i + 1];
        cur_cnt_inbin = 0;
        mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
      }
    }
    cur_cnt_inbin += counts[num_values - 1];
    // final bound of each closed bin: midpoint between its last value
    // and the next bin's first value
    for (int i = 0; i < bin_cnt; ++i) {
      bin_upper_bound_[i] = (bin_upper_bound_[i] + bin_lower_bound[i + 1]) / 2.0;
    }
    // last bin is unbounded above
    bin_upper_bound_[bin_cnt] = std::numeric_limits<double>::infinity();
    ++bin_cnt;
    // shrink the array if fewer bins were produced than requested
    if (bin_cnt < max_bin) {
      double* tmp_bin_upper_bound = bin_upper_bound_;
      num_bin_ = bin_cnt;
      bin_upper_bound_ = new double[num_bin_];
      for (int i = 0; i < num_bin_; ++i) {
        bin_upper_bound_[i] = tmp_bin_upper_bound[i];
      }
      delete[] tmp_bin_upper_bound;
    }
  }
  // trivial feature: everything falls into one bin
  is_trival_ = (num_bin_ <= 1);
  // sparse rate = fraction of samples in the first bin
  sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
}
// Serialized size of a mapper with `bin` bins:
// num_bin_(int) + is_trival_(bool) + sparse_rate_(double) + one double per bin.
int BinMapper::SizeForSpecificBin(int bin) {
  return static_cast<int>(sizeof(int) + sizeof(bool) + sizeof(double)
    + bin * sizeof(double));
}
// Serialize into a caller-provided buffer; the field order must match
// CopyFrom() and SaveBinaryToFile().
void BinMapper::CopyTo(char * buffer) {
  char* cur = buffer;
  std::memcpy(cur, &num_bin_, sizeof(num_bin_));
  cur += sizeof(num_bin_);
  std::memcpy(cur, &is_trival_, sizeof(is_trival_));
  cur += sizeof(is_trival_);
  std::memcpy(cur, &sparse_rate_, sizeof(sparse_rate_));
  cur += sizeof(sparse_rate_);
  std::memcpy(cur, bin_upper_bound_, num_bin_ * sizeof(double));
}
// Deserialize from a buffer written by CopyTo(); field order must match.
void BinMapper::CopyFrom(const char * buffer) {
  const char* cur = buffer;
  std::memcpy(&num_bin_, cur, sizeof(num_bin_));
  cur += sizeof(num_bin_);
  std::memcpy(&is_trival_, cur, sizeof(is_trival_));
  cur += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, cur, sizeof(sparse_rate_));
  cur += sizeof(sparse_rate_);
  // drop any previous bounds before allocating the restored array
  // (delete[] on nullptr is a no-op)
  delete[] bin_upper_bound_;
  bin_upper_bound_ = new double[num_bin_];
  std::memcpy(bin_upper_bound_, cur, num_bin_ * sizeof(double));
}
// Write the mapper to a binary file. The field order must match
// CopyTo()/CopyFrom(), and SizesInByte() must stay in sync with this layout.
void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
  fwrite(bin_upper_bound_, sizeof(double), num_bin_, file);
}
// Total serialized size: fixed header plus one double per bin.
// Mirrors the layout of CopyTo()/CopyFrom()/SaveBinaryToFile().
size_t BinMapper::SizesInByte() const {
  const size_t fixed_part = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_);
  return fixed_part + sizeof(double) * num_bin_;
}
// Explicit instantiations: the bin implementations are class templates
// defined in the included .hpp files; emit the uint8/uint16/uint32 variants
// (matching the widths chosen by CreateDenseBin/CreateSparseBin) in this TU.
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
// Choose a sparse or dense bin container for one feature.
// Sparse storage is used only when enabled and at least 80% of the sampled
// values fall into bin 0; *is_sparse reports the decision to the caller.
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, bool* is_sparse) {
  const double kSparseThreshold = 0.8;
  const bool use_sparse = is_enable_sparse && (sparse_rate >= kSparseThreshold);
  *is_sparse = use_sparse;
  return use_sparse ? CreateSparseBin(num_data, num_bin)
                    : CreateDenseBin(num_data, num_bin);
}
// Pick the narrowest unsigned element type that can hold bin ids [0, num_bin).
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new DenseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
  }
  return new DenseBin<uint32_t>(num_data);
}
// Pick the narrowest unsigned element type that can hold bin ids [0, num_bin).
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
  }
  return new SparseBin<uint32_t>(num_data);
}
} // namespace LightGBM
#include <LightGBM/config.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_map>
#include <algorithm>
namespace LightGBM {
// Top-level entry: parse all parameters into the main config and the
// network / io / boosting / objective / metric sub-configs, then validate.
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // load main config types
  GetInt(params, "num_threads", &num_threads);
  GetTaskType(params);
  // prediction task: data has no label column by default
  if (task_type == TaskType::kPredict) {
    io_config.data_has_label = false;
  }
  GetBoostingType(params);
  GetObjectiveType(params);
  GetMetricType(params);
  // construct boosting configs
  // NOTE(review): boosting_config is only assigned for kGBDT. If an
  // unrecognized boosting type got past GetBoostingType, the
  // boosting_config->Set(params) call below would dereference an unset
  // pointer — confirm that Log::Stderr aborts on bad input.
  if (boosting_type == BoostingType::kGBDT) {
    boosting_config = new GBDTConfig();
  }
  // sub-config setup
  network_config.Set(params);
  io_config.Set(params);
  boosting_config->Set(params);
  objective_config.Set(params);
  metric_config.Set(params);
  // check for conflicts
  CheckParamConflict();
}
// Parse the "boosting_type" parameter (case-insensitive).
// "gbdt" and "gbrt" both select gradient boosted trees.
void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "boosting_type", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == std::string("gbdt") || value == std::string("gbrt")) {
    boosting_type = BoostingType::kGBDT;
  } else {
    Log::Stderr("boosting type %s error", value.c_str());
  }
}
// Parse the "objective" parameter; objective names are matched lower-case.
void OverallConfig::GetObjectiveType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "objective", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  objective_type = value;
}
// Parse the comma-separated "metric" parameter into metric_types,
// lower-cased and de-duplicated.
void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (GetString(params, "metric", &value)) {
    // clear old metrics
    metric_types.clear();
    // to lower
    std::transform(value.begin(), value.end(), value.begin(), ::tolower);
    // split
    std::vector<std::string> metrics = Common::Split(value.c_str(), ',');
    // de-duplicate while preserving the user-specified order.
    // (previously the metrics were pushed in unordered_map iteration order,
    // which made the metric output order nondeterministic)
    std::unordered_map<std::string, int> seen;
    for (auto& metric : metrics) {
      std::transform(metric.begin(), metric.end(), metric.begin(), ::tolower);
      if (seen.count(metric) <= 0) {
        seen[metric] = 1;
        metric_types.push_back(metric);
      }
    }
  }
}
// Parse the "task" parameter (case-insensitive): training vs prediction.
void OverallConfig::GetTaskType(const std::unordered_map<std::string, std::string>& params) {
  std::string value;
  if (!GetString(params, "task", &value)) {
    return;  // keep the default when the key is absent
  }
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == std::string("train") || value == std::string("training")) {
    task_type = TaskType::kTrain;
  } else if (value == std::string("predict")
    || value == std::string("prediction")
    || value == std::string("test")) {
    task_type = TaskType::kPredict;
  } else {
    Log::Stderr("task type error");
  }
}
void OverallConfig::CheckParamConflict() {
if (network_config.num_machines > 1) {
is_parallel = true;
} else {
is_parallel = false;
dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type =
TreeLearnerType::kSerialTreeLearner;
}
if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kSerialTreeLearner) {
is_parallel = false;
network_config.num_machines = 1;
}
if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kSerialTreeLearner ||
dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kFeatureParallelTreelearner) {
is_parallel_find_bin = false;
} else if (dynamic_cast<GBDTConfig*>(boosting_config)->tree_learner_type ==
TreeLearnerType::kDataParallelTreeLearner) {
is_parallel_find_bin = true;
}
if (task_type == TaskType::kTrain && io_config.data_has_label == false) {
Log::Stderr("Data should have label in training task");
}
}
// Parse all I/O related parameters.
void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // binning / loading options
  GetInt(params, "max_bin", &max_bin);
  CHECK(max_bin > 0);
  GetInt(params, "data_random_seed", &data_random_seed);
  GetInt(params, "num_model_predict", &num_model_predict);
  // input data file is mandatory
  if (!GetString(params, "data", &data_filename)) {
    Log::Stderr("No training/prediction data, application quit");
  }
  // boolean switches
  GetBool(params, "data_has_label", &data_has_label);
  GetBool(params, "is_pre_partition", &is_pre_partition);
  GetBool(params, "is_enable_sparse", &is_enable_sparse);
  GetBool(params, "use_two_round_loading", &use_two_round_loading);
  GetBool(params, "is_save_binary_file", &is_save_binary_file);
  GetBool(params, "is_sigmoid", &is_sigmoid);
  // file paths
  GetString(params, "output_model", &output_model);
  GetString(params, "input_model", &input_model);
  GetString(params, "output_result", &output_result);
  GetString(params, "input_init_score", &input_init_score);
  // comma-separated list of validation data files
  std::string valid_str;
  if (GetString(params, "valid_data", &valid_str)) {
    valid_data_filenames = Common::Split(valid_str.c_str(), ',');
  }
}
// Parse all objective-function parameters.
void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetBool(params, "is_unbalance", &is_unbalance);
  GetDouble(params, "sigmoid", &sigmoid);
  GetInt(params, "max_position", &max_position);
  CHECK(max_position > 0);
  // relevance-label gains for ranking: user supplied or default 2^i - 1
  std::string gain_str;
  if (GetString(params, "label_gain", &gain_str)) {
    label_gain = Common::StringToDoubleArray(gain_str, ',');
  } else {
    // default gain = 2^i - 1, capped at 31 labels so (1 << i) cannot overflow
    const int max_label = 31;
    label_gain.push_back(0.0);
    for (int i = 1; i < max_label; ++i) {
      label_gain.push_back(static_cast<double>((1 << i) - 1));
    }
  }
}
void MetricConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "metric_freq", &output_freq);
CHECK(output_freq >= 0);
GetDouble(params, "sigmoid", &sigmoid);
GetBool(params, "is_training_metric", &is_provide_training_metric);
std::string tmp_str = "";
if (GetString(params, "label_gain", &tmp_str)) {
label_gain = Common::StringToDoubleArray(tmp_str, ',');
} else {
// label_gain = 2^i - 1, may overflow, so we use 31 here
const int max_label = 31;
label_gain.push_back(0.0);
for (int i = 1; i < max_label; ++i) {
label_gain.push_back((1 << i) - 1);
}
}
if (GetString(params, "ndcg_eval_at", &tmp_str)) {
eval_at = Common::StringToIntArray(tmp_str, ',');
std::sort(eval_at.begin(), eval_at.end());
for (size_t i = 0; i < eval_at.size(); ++i) {
CHECK(eval_at[i] > 0);
}
} else {
// default eval ndcg @[1-5]
for (int i = 1; i <= 5; ++i) {
eval_at.push_back(i);
}
}
}
/*!
* \brief Parse tree-growth parameters out of the key/value map and
*        validate their ranges. CHECK aborts on the first invalid value,
*        so the read/validate order here is part of the observable behavior.
*/
void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // minimum number of records a leaf may hold
  GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
  CHECK(min_data_in_leaf > 0);
  // minimum total hessian a leaf may hold (regularization on leaf size)
  GetDouble(params, "min_sum_hessian_in_leaf", &min_sum_hessian_in_leaf);
  CHECK(min_sum_hessian_in_leaf >= 0.0);
  GetInt(params, "num_leaves", &num_leaves);
  CHECK(num_leaves > 0);
  GetInt(params, "feature_fraction_seed", &feature_fraction_seed);
  // fraction of features sampled per tree; must be in (0, 1]
  GetDouble(params, "feature_fraction", &feature_fraction);
  CHECK(feature_fraction > 0.0 && feature_fraction <= 1.0);
}
/*!
* \brief Parse shared boosting parameters (iterations, bagging, learning
*        rate) from the key/value map and validate them. CHECK aborts on
*        the first invalid value, so validation order matters.
*/
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetInt(params, "num_iterations", &num_iterations);
  CHECK(num_iterations >= 0);
  GetInt(params, "bagging_seed", &bagging_seed);
  // bagging_freq == 0 disables bagging entirely
  GetInt(params, "bagging_freq", &bagging_freq);
  CHECK(bagging_freq >= 0);
  // fraction of data sampled when bagging; must be in (0, 1]
  GetDouble(params, "bagging_fraction", &bagging_fraction);
  CHECK(bagging_fraction > 0.0 && bagging_fraction <= 1.0);
  GetDouble(params, "learning_rate", &learning_rate);
  CHECK(learning_rate > 0.0);
}
void GBDTConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::string>& params) {
  // when "tree_learner" is absent the current value is left untouched
  std::string value;
  if (!GetString(params, "tree_learner", &value)) {
    return;
  }
  // match case-insensitively
  std::transform(value.begin(), value.end(), value.begin(), ::tolower);
  if (value == "serial") {
    tree_learner_type = TreeLearnerType::kSerialTreeLearner;
  } else if (value == "feature" || value == "feature_parallel") {
    tree_learner_type = TreeLearnerType::kFeatureParallelTreelearner;
  } else if (value == "data" || value == "data_parallel") {
    tree_learner_type = TreeLearnerType::kDataParallelTreeLearner;
  } else {
    // unrecognized learner name is a hard error
    Log::Stderr("tree learner type error");
  }
}
/*!
* \brief Parse all GBDT parameters: base boosting settings, the tree
*        learner type, then the nested tree configuration. Each step reads
*        from the same key/value map.
*/
void GBDTConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  // shared boosting parameters (iterations, bagging, learning rate)
  BoostingConfig::Set(params);
  // map the "tree_learner" string onto tree_learner_type
  GetTreeLearnerType(params);
  // tree-growth parameters (leaves, min data, feature fraction, ...)
  tree_config.Set(params);
}
/*!
* \brief Parse parallel-learning network parameters and validate them.
*        CHECK aborts on the first invalid value, so the order of the
*        read/validate pairs is part of the observable behavior.
*/
void NetworkConfig::Set(const std::unordered_map<std::string, std::string>& params) {
  GetInt(params, "num_machines", &num_machines);
  CHECK(num_machines >= 1);
  // TCP port this machine listens on for peer connections
  GetInt(params, "local_listen_port", &local_listen_port);
  CHECK(local_listen_port > 0);
  // socket timeout; units not visible here — presumably minutes or seconds, confirm against Network code
  GetInt(params, "time_out", &time_out);
  CHECK(time_out > 0);
  // file listing ip:port of all machines in the cluster
  GetString(params, "machine_list_file", &machine_list_filename);
}
} // namespace LightGBM
#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
#include <omp.h>
#include <cstdio>
#include <unordered_map>
#include <limits>
#include <vector>
#include <utility>
#include <string>
namespace LightGBM {
/*!
* \brief Construct a Dataset bound to a text data file. Decides up front
*        whether a ".bin" cache of the file exists and wires up either the
*        text-parsing path (parser + text reader) or the binary-loading
*        path. Actual data loading happens later in LoadTrainData /
*        LoadValidationData.
*/
Dataset::Dataset(const char* data_filename, const char* init_score_filename,
  int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun)
  :data_filename_(data_filename), random_(random_seed),
  max_bin_(max_bin), is_enable_sparse_(is_enable_sparse), predict_fun_(predict_fun) {
  // sets is_loading_from_binfile_ based on whether "<data>.bin" can be opened
  CheckCanLoadFromBin();
  // initial-score prediction needs the raw text features, so the binary
  // fast path is disabled when a predict function is supplied
  if (is_loading_from_binfile_ && predict_fun != nullptr) {
    Log::Stdout("cannot perform initial prediction for binary file, will use text file instead");
    is_loading_from_binfile_ = false;
  }
  if (!is_loading_from_binfile_) {
    // load weight, query information and initialize score
    metadata_.Init(data_filename, init_score_filename);
    // create text parser
    parser_ = Parser::CreateParser(data_filename_);
    if (parser_ == nullptr) {
      Log::Stderr("cannot recognise input data format, filename: %s", data_filename_);
    }
    // create text reader
    text_reader_ = new TextReader<data_size_t>(data_filename);
  } else {
    // only need to load initial score; other meta data will load from the bin file
    metadata_.Init(init_score_filename);
    Log::Stdout("will load data set from binary file");
    parser_ = nullptr;
    text_reader_ = nullptr;
  }
}
/*!
* \brief Release everything the Dataset owns: the parser, the text reader
*        and all Feature objects.
*/
Dataset::~Dataset() {
  // delete on a null pointer is a no-op, so no null checks are needed
  delete parser_;
  delete text_reader_;
  for (auto& feature : features_) {
    delete feature;
  }
  features_.clear();
}
/*!
* \brief Read the text data file fully into the text reader's line buffer.
*        With one machine (or pre-partitioned data) every line is kept;
*        otherwise each machine keeps a random ~1/num_machines subset,
*        sampling whole queries at a time when query boundaries exist.
*        Sets num_data_ (local rows), global_num_data_ (total rows) and
*        used_data_indices_ (kept line indices, when partitioning).
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
*/
void Dataset::LoadDataToMemory(int rank, int num_machines, bool is_pre_partition) {
  used_data_indices_.clear();
  if (num_machines == 1 || is_pre_partition) {
    // read all lines
    num_data_ = text_reader_->ReadAllLines();
    global_num_data_ = num_data_;
  } else { // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query data, minimal sample unit is one record:
      // keep each line with probability 1/num_machines
      global_num_data_ = text_reader_->ReadAndFilterLines([this, rank, num_machines](data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, &used_data_indices_);
    } else {
      // if contain query data, minimal sample unit is one query so that a
      // query's records never get split across machines
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: the first line triggers the boundary check
      // against query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      global_num_data_ = text_reader_->ReadAndFilterLines(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, please ensure your query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, &used_data_indices_);
    }
    // set number of data
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
}
void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) {
const size_t sample_cnt = static_cast<size_t>(num_data_ < 50000 ? num_data_ : 50000);
std::vector<size_t> sample_indices = random_.Sample(num_data_, sample_cnt);
out_data->clear();
for (size_t i = 0; i < sample_indices.size(); ++i) {
const size_t idx = sample_indices[i];
out_data->push_back(text_reader_->Lines()[idx]);
}
}
/*!
* \brief Two-round loading counterpart of LoadDataToMemory +
*        SampleDataFromMemory: stream the file, keep only this machine's
*        partition (whole queries when boundaries exist), and collect up to
*        50000 sampled lines into out_data without holding the full file in
*        memory. Sets num_data_, global_num_data_ and used_data_indices_.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
* \param out_data Receives the sampled raw text lines
*/
void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partition,
  std::vector<std::string>* out_data) {
  used_data_indices_.clear();
  const size_t sample_cnt = 50000;
  if (num_machines == 1 || is_pre_partition) {
    num_data_ = static_cast<data_size_t>(text_reader_->SampleFromFile(random_, sample_cnt, out_data));
    global_num_data_ = num_data_;
  } else { // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // keep each line with probability 1/num_machines
      global_num_data_ = text_reader_->SampleAndFilterFromFile([this, rank, num_machines]
      (data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, &used_data_indices_, random_, sample_cnt, out_data);
    } else {
      // if contain query file, minimal sample unit is one query so a
      // query's records never get split across machines
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: the first line triggers the boundary check
      // against query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      global_num_data_ = text_reader_->SampleAndFilterFromFile(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, \
                please ensure your query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, &used_data_indices_, random_, sample_cnt, out_data);
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
}
/*!
* \brief Build one BinMapper per raw feature from the sampled rows, then
*        keep only non-trivial features (more than one bin) as Feature
*        objects. With one machine every mapper is found locally; with
*        several machines each machine bins a disjoint slice of features
*        and the results are exchanged via Network::Allgather. Fills
*        used_feature_map_ (raw index -> used index or -1), features_ and
*        num_features_.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param sample_data Sampled raw text lines used to estimate bin boundaries
*/
void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<std::string>& sample_data) {
  // sample_values[i][j], means the value of j-th sample on i-th feature
  std::vector<std::vector<double>> sample_values;
  // temp buffer for one line features and label
  std::vector<std::pair<int, double>> oneline_features;
  double label;
  for (size_t i = 0; i < sample_data.size(); ++i) {
    oneline_features.clear();
    // parse features
    parser_->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
    // push 0 first, then edit the value according existing feature values
    for (auto& feature_values : sample_values) {
      feature_values.push_back(0.0);
    }
    for (std::pair<int, double>& inner_data : oneline_features) {
      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
        // if need expand feature set (sparse input may reveal new feature
        // indices at any row)
        size_t need_size = inner_data.first - sample_values.size() + 1;
        for (size_t j = 0; j < need_size; ++j) {
          // new feature columns are back-filled with i+1 zeros, one per
          // row seen so far including the current one
          sample_values.emplace_back(i + 1, 0.0);
        }
      }
      // edit the feature value
      sample_values[inner_data.first][i] = inner_data.second;
    }
  }
  features_.clear();
  // -1 means doesn't use this feature
  used_feature_map_ = std::vector<int>(sample_values.size(), -1);
  // start find bins
  if (num_machines == 1) {
    std::vector<BinMapper*> bin_mappers(sample_values.size());
    // if only 1 machines, find bin locally
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
      bin_mappers[i] = new BinMapper();
      bin_mappers[i]->FindBin(&sample_values[i], max_bin_);
    }
    for (size_t i = 0; i < sample_values.size(); ++i) {
      if (!bin_mappers[i]->is_trival()) {
        // map real feature index to used feature index
        used_feature_map_[i] = static_cast<int>(features_.size());
        // push new feature; the Feature takes ownership of the mapper
        features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
                                            num_data_, is_enable_sparse_));
      } else {
        // if feature is trival(only 1 bin), free spaces
        delete bin_mappers[i];
      }
    }
  } else {
    // if have multi-machines, need find bin distributed
    // different machines will find bin for different features
    // start and len will store the process feature indices for different machines
    // machine i will find bins for features in [ strat[i], start[i] + len[i] )
    int* start = new int[num_machines];
    int* len = new int[num_machines];
    int total_num_feature = static_cast<int>(sample_values.size());
    // ceiling division so every feature is assigned to some machine
    int step = (total_num_feature + num_machines - 1) / num_machines;
    if (step < 1) { step = 1; }
    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
      len[i] = Common::Min<int>(step, total_num_feature - start[i]);
      start[i + 1] = start[i] + len[i];
    }
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
    // get size of bin mapper with max_bin_ size
    int type_size = BinMapper::SizeForSpecificBin(max_bin_);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    int buffer_size = type_size * total_num_feature;
    char* input_buffer = new char[buffer_size];
    char* output_buffer = new char[buffer_size];
    // find local feature bins and copy to buffer
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->FindBin(&sample_values[start[rank] + i], max_bin_);
      bin_mapper->CopyTo(input_buffer + i * type_size);
      // don't need this any more
      delete bin_mapper;
    }
    // convert counts to byte sizes/offsets for the Allgather call
    for (int i = 0; i < num_machines; ++i) {
      start[i] *= type_size;
      len[i] *= type_size;
    }
    // gather global feature bin mappers
    Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
    // restore features bins from buffer
    for (int i = 0; i < total_num_feature; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->CopyFrom(output_buffer + i * type_size);
      if (!bin_mapper->is_trival()) {
        used_feature_map_[i] = static_cast<int>(features_.size());
        features_.push_back(new Feature(static_cast<int>(i), bin_mapper, num_data_, is_enable_sparse_));
      } else {
        delete bin_mapper;
      }
    }
    // free buffer
    delete[] start;
    delete[] len;
    delete[] input_buffer;
    delete[] output_buffer;
  }
  num_features_ = static_cast<int>(features_.size());
}
/*!
* \brief Top-level training-data load. Either goes through the text path
*        (one-round: whole file in memory; two-round: sample first, then
*        re-read for feature extraction) or loads a previously saved
*        binary file. Afterwards partitions/validates the metadata and
*        sanity-checks the loaded dataset.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
* \param use_two_round_loading True to trade speed for lower peak memory
*/
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_ ) {
    if (!use_two_round_loading) {
      // read data to memory
      LoadDataToMemory(rank, num_machines, is_pre_partition);
      std::vector<std::string> sample_data;
      // sample data
      SampleDataFromMemory(&sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.InitLabel(num_data_);
      // extract features from the lines already held in memory
      ExtractFeaturesFromMemory();
    } else {
      std::vector<std::string> sample_data;
      // sample data from file (also performs the partition)
      SampleDataFromFile(rank, num_machines, is_pre_partition, &sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.InitLabel(num_data_);
      // extract features by re-reading the file (second round)
      ExtractFeaturesFromFile();
    }
  } else {
    // load data from binary file
    LoadDataFromBinFile(rank, num_machines, is_pre_partition);
  }
  // check meta data sizes against the data, and cut metadata down to the
  // local partition when used_data_indices_ is non-empty
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // free memory
  used_data_indices_.clear();
  used_data_indices_.shrink_to_fit();
  // need to check training data
  CheckDataset();
}
/*!
* \brief Load a validation set. Bin mappers are NOT re-estimated: every
*        Feature copies the BinMapper of the corresponding training-set
*        feature so validation data is discretized identically. Validation
*        data is never partitioned across machines (rank 0 / 1 machine).
* \param train_set Trained dataset whose bin mappers and feature map are copied
* \param use_two_round_loading True to trade speed for lower peak memory
*/
void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_ ) {
    if (!use_two_round_loading) {
      // read data in memory
      LoadDataToMemory(0, 1, false);
      // initialize label
      metadata_.InitLabel(num_data_);
      features_.clear();
      // copy feature bin mapper data from the training set
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      // Get number of lines of data file
      num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
      // initialize label
      metadata_.InitLabel(num_data_);
      features_.clear();
      // copy feature bin mapper data from the training set
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      // extract features by streaming the file
      ExtractFeaturesFromFile();
    }
  } else {
    // load from binary file
    LoadDataFromBinFile(0, 1, false);
  }
  // not need to check validation data
  // check meta data
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // CheckDataset();
}
/*!
* \brief Parse every buffered text line in parallel, write labels into
*        metadata_, push feature values into the per-feature bins, and
*        (when an initial-prediction function was supplied) compute the
*        initial score per row. Frees each text line as it is consumed and
*        clears the text reader at the end.
*        OpenMP note: private(oneline_features) gives each thread its own
*        default-constructed vector; firstprivate(tmp_label) copies the 0.0
*        initializer into each thread.
*/
void Dataset::ExtractFeaturesFromMemory() {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set label
      metadata_.SetLabelAt(i, tmp_label);
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        }
      }
    }
  } else {
    // if need to prediction with initial model
    score_t* init_score = new score_t[num_data_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      init_score[i] = static_cast<score_t>(predict_fun_(oneline_features));
      // set label
      metadata_.SetLabelAt(i, tmp_label);
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        }
      }
    }
    // metadata_ will manage space of init_score
    metadata_.SetInitScore(init_score);
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; i++) {
    features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_reader_->Clear();
}
/*!
* \brief Second round of two-round loading: stream the data file again in
*        chunks and, for each chunk of lines, parse in parallel, set
*        labels, optionally compute initial scores, and push feature
*        values into the bins. Uses only the previously recorded
*        used_data_indices_ when the data was partitioned.
*/
void Dataset::ExtractFeaturesFromFile() {
  score_t* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    init_score = new score_t[num_data_];
  }
  // callback invoked by the text reader for each chunk of lines;
  // start_idx is the row index of lines[0] within the local data
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score]
  (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0;
    // private(oneline_features) gives each thread its own vector;
    // firstprivate(tmp_label) copies the 0.0 initializer per thread
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        init_score[start_idx + i] = static_cast<score_t>(predict_fun_(oneline_features));
      }
      // set label
      metadata_.SetLabelAt(start_idx + i, tmp_label);
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature (trivial features map to -1 and are dropped)
          features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        }
      }
    }
  };
  if (used_data_indices_.size() > 0) {
    // only need part of data
    text_reader_->ReadPartAndProcessParallel(used_data_indices_, process_fun);
  } else {
    // need full data
    text_reader_->ReadAllAndProcessParallel(process_fun);
  }
  // metadata_ will manage space of init_score
  if (init_score != nullptr) {
    metadata_.SetInitScore(init_score);
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; i++) {
    features_[i]->FinishLoad();
  }
}
/*!
* \brief Serialize the dataset to "<data>.bin" so later runs can skip text
*        parsing. Layout: [header size][header fields + used_feature_map]
*        [metadata size][metadata][per feature: size + payload]. The field
*        order here must stay in byte-for-byte agreement with
*        LoadDataFromBinFile. No-op when the dataset itself came from a
*        binary file.
*/
void Dataset::SaveBinaryFile() {
  // if is loaded from binary file, not need to save
  if (!is_loading_from_binfile_) {
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, bin_filename.c_str(), "wb");
#else
    file = fopen(bin_filename.c_str(), "wb");
#endif
    if (file == NULL) {
      Log::Stderr("cannot write binary data to %s ", bin_filename.c_str());
    }
    Log::Stdout("start save binary file for data %s", data_filename_);
    // get size of header; the final two terms are the used_feature_map_
    // length prefix (size_t) plus its int payload
    size_t size_of_header = sizeof(global_num_data_) + sizeof(is_enable_sparse_)
      + sizeof(max_bin_) + sizeof(num_data_) + sizeof(num_features_) + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
    fwrite(&size_of_header, sizeof(size_of_header), 1, file);
    // write header
    fwrite(&global_num_data_, sizeof(global_num_data_), 1, file);
    fwrite(&is_enable_sparse_, sizeof(is_enable_sparse_), 1, file);
    fwrite(&max_bin_, sizeof(max_bin_), 1, file);
    fwrite(&num_data_, sizeof(num_data_), 1, file);
    fwrite(&num_features_, sizeof(num_features_), 1, file);
    size_t num_used_feature_map = used_feature_map_.size();
    fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file);
    fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file);
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
    fwrite(&size_of_metadata, sizeof(size_of_metadata), 1, file);
    // write meta data
    metadata_.SaveBinaryToFile(file);
    // write feature data, each feature prefixed with its byte size
    for (int i = 0; i < num_features_; ++i) {
      // get size of feature
      size_t size_of_feature = features_[i]->SizesInByte();
      fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
      // write feature
      features_[i]->SaveBinaryToFile(file);
    }
    fclose(file);
  }
}
void Dataset::CheckCanLoadFromBin() {
  // the binary cache lives next to the text file with a ".bin" suffix
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  // the cache is usable exactly when the file opens for reading
  is_loading_from_binfile_ = (file != NULL);
  if (file != NULL) {
    fclose(file);
  }
}
/*!
* \brief Load the dataset from "<data>.bin", mirroring SaveBinaryFile's
*        layout exactly: [header size][header + used_feature_map]
*        [metadata size][metadata][per feature: size + payload]. When
*        running on several machines without pre-partitioned data, a random
*        subset of rows (whole queries when boundaries exist) is selected
*        for this machine and the labels/features are restricted to it.
*        A single reusable buffer is grown as needed for the largest chunk.
* \param rank This machine's id in [0, num_machines)
* \param num_machines Number of machines in the cluster
* \param is_pre_partition True if the file already holds only this machine's share
*/
void Dataset::LoadDataFromBinFile(int rank, int num_machines, bool is_pre_partition) {
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  if (file == NULL) {
    Log::Stderr("cannot read binary data from %s", bin_filename.c_str());
  }
  // buffer to read binary file; starts at 16MB and grows on demand
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Stderr("binary file format error at header size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Stderr("binary file format error at header");
  }
  // get header fields in the exact order SaveBinaryFile wrote them
  const char* mem_ptr = buffer;
  global_num_data_ = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(global_num_data_);
  is_enable_sparse_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(is_enable_sparse_);
  max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(max_bin_);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    used_feature_map_.push_back(tmp_feature_map[i]);
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Stderr("binary file format error at size of meta data");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Stderr("binary file format error at meta data");
  }
  // load meta data
  metadata_.LoadFromMemory(buffer);
  used_data_indices_.clear();
  global_num_data_ = num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !is_pre_partition) {
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // keep each row with probability 1/num_machines
      for (data_size_t i = 0; i < num_data_; i++) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices_.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = metadata_.num_queries();
      // qid starts at -1: row 0 triggers the boundary check against
      // query_boundaries[0] and bumps qid to 0
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < num_data_; i++) {
        if (qid >= num_queries) {
          Log::Stderr("current query is exceed the range of query file, please ensure your query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this machine keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices_.push_back(i);
        }
      }
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
  // restrict labels to the locally kept rows (no-op when not partitioned)
  metadata_.PartitionLabel(used_data_indices_);
  // read feature data
  for (int i = 0; i < num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Stderr("binary file format error at feature %d's size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but printed with %d — confirm Log handles this
      Log::Stderr("binary file format error at feature %d loading , read count %d", i, read_cnt);
    }
    features_.push_back(new Feature(buffer, static_cast<data_size_t>(global_num_data_), used_data_indices_));
  }
  delete[] buffer;
  fclose(file);
}
/*!
* \brief Sanity-check the loaded training data: it must contain at least
*        one record and at least one usable (non-trivial) feature.
*/
void Dataset::CheckDataset() {
  if (num_data_ <= 0) {
    Log::Stderr("data size of %s is zero", data_filename_);
  }
  // empty() states the intent directly; size() is unsigned, so "<= 0"
  // could only ever mean "== 0" and draws sign-compare warnings
  if (features_.empty()) {
    Log::Stderr("not useful feature of data %s", data_filename_);
  }
}
} // namespace LightGBM
#ifndef LIGHTGBM_IO_DENSE_BIN_HPP_
#define LIGHTGBM_IO_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <vector>
#include <cstring>
#include <cstdint>
namespace LightGBM {
/*!
* \brief Used to Store bins for dense feature
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin: public Bin {
public:
  /*!
  * \brief Allocate one VAL_T per record, zero-initialized
  * \param num_data Number of records this bin column will hold
  */
  explicit DenseBin(data_size_t num_data)
    : num_data_(num_data) {
    data_ = new VAL_T[num_data_];
    std::memset(data_, 0, sizeof(VAL_T)*num_data_);
  }
  ~DenseBin() {
    delete[] data_;
  }
  /*!
  * \brief Store the bin value for one record; the unnamed thread-id
  *        parameter is unused because writes go to distinct indices
  */
  void Push(int, data_size_t idx, uint32_t value) override {
    data_[idx] = static_cast<VAL_T>(value);
  }
  /*! \brief Read back the bin value of record idx */
  inline uint32_t Get(data_size_t idx) const {
    return static_cast<uint32_t>(data_[idx]);
  }
  BinIterator* GetIterator(data_size_t start_idx) const override;
  /*!
  * \brief Accumulate gradient/hessian/count sums per bin. When
  *        data_indices is non-null only those records are used, and
  *        ordered_gradients/ordered_hessians are indexed by position i
  *        (not by the record index). The body is manually unrolled 4-way.
  */
  void ConstructHistogram(data_size_t* data_indices, data_size_t num_data,
    const score_t* ordered_gradients, const score_t* ordered_hessians,
    HistogramBinEntry* out) const override {
    // use 4-way unrolling, will be faster
    if (data_indices != nullptr) {  // if use part of data
      data_size_t rest = num_data % 4;
      data_size_t i = 0;
      for (; i < num_data - rest; i += 4) {
        VAL_T bin0 = data_[data_indices[i]];
        VAL_T bin1 = data_[data_indices[i + 1]];
        VAL_T bin2 = data_[data_indices[i + 2]];
        VAL_T bin3 = data_[data_indices[i + 3]];
        out[bin0].sum_gradients += ordered_gradients[i];
        out[bin1].sum_gradients += ordered_gradients[i + 1];
        out[bin2].sum_gradients += ordered_gradients[i + 2];
        out[bin3].sum_gradients += ordered_gradients[i + 3];
        out[bin0].sum_hessians += ordered_hessians[i];
        out[bin1].sum_hessians += ordered_hessians[i + 1];
        out[bin2].sum_hessians += ordered_hessians[i + 2];
        out[bin3].sum_hessians += ordered_hessians[i + 3];
        ++out[bin0].cnt;
        ++out[bin1].cnt;
        ++out[bin2].cnt;
        ++out[bin3].cnt;
      }
      // handle the up-to-3 leftover records
      for (; i < num_data; ++i) {
        VAL_T bin = data_[data_indices[i]];
        out[bin].sum_gradients += ordered_gradients[i];
        out[bin].sum_hessians += ordered_hessians[i];
        ++out[bin].cnt;
      }
    }
    else {  // use full data
      data_size_t rest = num_data % 4;
      data_size_t i = 0;
      for (; i < num_data - rest; i += 4) {
        VAL_T bin0 = data_[i];
        VAL_T bin1 = data_[i + 1];
        VAL_T bin2 = data_[i + 2];
        VAL_T bin3 = data_[i + 3];
        out[bin0].sum_gradients += ordered_gradients[i];
        out[bin1].sum_gradients += ordered_gradients[i + 1];
        out[bin2].sum_gradients += ordered_gradients[i + 2];
        out[bin3].sum_gradients += ordered_gradients[i + 3];
        out[bin0].sum_hessians += ordered_hessians[i];
        out[bin1].sum_hessians += ordered_hessians[i + 1];
        out[bin2].sum_hessians += ordered_hessians[i + 2];
        out[bin3].sum_hessians += ordered_hessians[i + 3];
        ++out[bin0].cnt;
        ++out[bin1].cnt;
        ++out[bin2].cnt;
        ++out[bin3].cnt;
      }
      // handle the up-to-3 leftover records
      for (; i < num_data; ++i) {
        VAL_T bin = data_[i];
        out[bin].sum_gradients += ordered_gradients[i];
        out[bin].sum_hessians += ordered_hessians[i];
        ++out[bin].cnt;
      }
    }
  }
  /*!
  * \brief Partition data_indices by bin value: records with bin value
  *        <= threshold go to lte_indices, the rest to gt_indices
  * \return Number of records placed in lte_indices
  */
  data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
    for (data_size_t i = 0; i < num_data; ++i) {
      data_size_t idx = data_indices[i];
      if (data_[idx] > threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }
  data_size_t num_data() const override { return num_data_; }
  /*! \brief not ordered bin for dense feature */
  OrderedBin* CreateOrderedBin() const override { return nullptr; }
  /*! \brief Dense storage needs no post-load finalization */
  void FinishLoad() override {}
  /*!
  * \brief Restore bin values from a memory image written by
  *        SaveBinaryToFile. When local_used_indices is non-empty, only
  *        those rows of the (larger) saved image are kept, compacted to
  *        the front.
  */
  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
    const VAL_T* mem_data = reinterpret_cast<const VAL_T*>(memory);
    if (local_used_indices.size() > 0) {
      for (int i = 0; i < num_data_; ++i) {
        data_[i] = mem_data[local_used_indices[i]];
      }
    } else {
      for (int i = 0; i < num_data_; ++i) {
        data_[i] = mem_data[i];
      }
    }
  }
  /*! \brief Write the raw bin array; must match SizesInByte() below */
  void SaveBinaryToFile(FILE* file) const override {
    fwrite(data_, sizeof(VAL_T), num_data_, file);
  }
  size_t SizesInByte() const override {
    return sizeof(VAL_T) * num_data_;
  }
private:
  // number of records
  data_size_t num_data_;
  // owned flat array of one bin value per record
  VAL_T* data_;
};
template <typename VAL_T>
class DenseBinIterator: public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data)
: bin_data_(bin_data) {
}
uint32_t Get(data_size_t idx) override {
return bin_data_->Get(idx);
}
private:
const DenseBin<VAL_T>* bin_data_;
};
/*!
* \brief Create an iterator over this bin; the start index is ignored
*        because dense storage supports random access. Caller owns the
*        returned iterator.
*/
template <typename VAL_T>
BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const {
  return new DenseBinIterator<VAL_T>(this);
}
} // namespace LightGBM
#endif #endif // LightGBM_IO_DENSE_BIN_HPP_
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <vector>
#include <string>
namespace LightGBM {
/*!
* \brief Default-construct with every owned array pointer null; the actual
*        buffers are allocated later by Init/InitLabel/load routines.
*/
Metadata::Metadata()
  :label_(nullptr), label_int_(nullptr), weights_(nullptr),
  query_boundaries_(nullptr),
  query_weights_(nullptr), init_score_(nullptr) {
}
/*!
* \brief Full initialization for the text-loading path: remember the file
*        names and load all auxiliary inputs (query boundaries, weights,
*        per-query weights, initial scores) from their side files.
* \param data_filename Path of the main data file (side files derive from it)
* \param init_score_filename Path of the initial-score file
*/
void Metadata::Init(const char * data_filename, const char* init_score_filename) {
  data_filename_ = data_filename;
  init_score_filename_ = init_score_filename;
  // for lambdarank, it needs query data for partition data in parallel learning
  LoadQueryBoundaries();
  LoadWeights();
  // query weights depend on record weights + boundaries, so load them after
  LoadQueryWeights();
  LoadInitialScore();
}
/*!
* \brief Partial initialization for the binary-loading path: only the
*        initial score comes from a side file; all other metadata is read
*        from the binary image later.
* \param init_score_filename Path of the initial-score file
*/
void Metadata::Init(const char* init_score_filename) {
  init_score_filename_ = init_score_filename;
  LoadInitialScore();
}
/*!
* \brief Free every owned metadata array. delete[] on a null pointer is a
*        no-op, so the previous per-pointer null checks were redundant.
*/
Metadata::~Metadata() {
  delete[] label_;
  delete[] weights_;
  delete[] query_boundaries_;
  delete[] query_weights_;
  delete[] init_score_;
  // NOTE(review): label_int_ is initialized to nullptr in the constructor
  // but never freed here — if it ever owns a heap allocation this leaks;
  // confirm its ownership where it is assigned.
}
/*!
* \brief Allocate the (uninitialized) label array for num_data records;
*        values are filled in later via SetLabelAt during feature extraction
* \param num_data Number of records
*/
void Metadata::InitLabel(data_size_t num_data) {
  num_data_ = num_data;
  label_ = new float[num_data_];
}
void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
if (used_indices.size() <= 0) {
return;
}
float* old_label = label_;
num_data_ = static_cast<data_size_t>(used_indices.size());
label_ = new float[num_data_];
for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = old_label[used_indices[i]];
}
delete[] old_label;
}
/*!
* \brief Check side metadata sizes against the data set and, when a subset
*        of data is used (bagging / parallel learning), rebuild weights,
*        query boundaries and initial scores for that subset only.
* \param num_all_data Number of records in the full data set
* \param used_data_indices Ordered indices of locally used records; empty
*        means all data are used and only size checks are performed
*/
void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
  if (used_data_indices.size() == 0) {
    // check weights
    if (weights_ != nullptr && num_weights_ != num_data_) {
      Log::Stdout("init weight size doesn't equal with data file, will ignore");
      delete[] weights_;
      num_weights_ = 0;
      weights_ = nullptr;
    }
    // check query boundries
    if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_data_) {
      Log::Stdout("init query size doesn't equal with data file, will ignore");
      delete[] query_boundaries_;
      num_queries_ = 0;
      query_boundaries_ = nullptr;
    }
    // contain initial score file
    if (init_score_ != nullptr && num_init_score_ != num_data_) {
      delete[] init_score_;
      Log::Stdout("init score size doesn't equal with data file, will ignore");
      num_init_score_ = 0;
      // fix: reset the pointer, otherwise the destructor frees it again
      init_score_ = nullptr;
    }
  } else {
    data_size_t num_used_data = static_cast<data_size_t>(used_data_indices.size());
    // check weights
    if (weights_ != nullptr && num_weights_ != num_all_data) {
      Log::Stdout("init weight size doesn't equal with data file, will ignore");
      delete[] weights_;
      num_weights_ = 0;
      weights_ = nullptr;
    }
    // check query boundries
    if (query_boundaries_ != nullptr && query_boundaries_[num_queries_] != num_all_data) {
      Log::Stdout("init query size doesn't equal with data file, will ignore");
      delete[] query_boundaries_;
      num_queries_ = 0;
      query_boundaries_ = nullptr;
    }
    // contain initial score file
    if (init_score_ != nullptr && num_init_score_ != num_all_data) {
      Log::Stdout("init score size doesn't equal with data file, will ignore");
      delete[] init_score_;
      num_init_score_ = 0;
      // fix: reset the pointer; the "get local initial scores" step below
      // and the destructor both test init_score_ against nullptr
      init_score_ = nullptr;
    }
    // get local weights (num_data_ holds the local size at this point —
    // presumably PartitionLabel already shrank it; verify against caller)
    if (weights_ != nullptr) {
      float* old_weights = weights_;
      num_weights_ = num_data_;
      weights_ = new float[num_data_];
      for (size_t i = 0; i < used_data_indices.size(); ++i) {
        weights_[i] = old_weights[used_data_indices[i]];
      }
      delete[] old_weights;
    }
    // get local query boundaries: a query must be kept entirely or not at all
    if (query_boundaries_ != nullptr) {
      std::vector<data_size_t> used_query;
      data_size_t data_idx = 0;
      for (data_size_t qid = 0; qid < num_queries_ && data_idx < num_used_data; ++qid) {
        data_size_t start = query_boundaries_[qid];
        data_size_t end = query_boundaries_[qid + 1];
        data_size_t len = end - start;
        if (used_data_indices[data_idx] > start) {
          // this whole query was dropped locally; skip it
          continue;
        } else if (used_data_indices[data_idx] == start) {
          if (num_used_data >= data_idx + len && used_data_indices[data_idx + len - 1] == end - 1) {
            // the complete query is present in the local subset
            used_query.push_back(qid);
            data_idx += len;
          } else {
            Log::Stderr("data partition error, not according to query");
          }
        } else {
          Log::Stderr("data partition error, not according to query");
        }
      }
      data_size_t * old_query_boundaries = query_boundaries_;
      query_boundaries_ = new data_size_t[used_query.size() + 1];
      num_queries_ = static_cast<data_size_t>(used_query.size());
      query_boundaries_[0] = 0;
      for (data_size_t i = 0; i < num_queries_; ++i) {
        data_size_t qid = used_query[i];
        data_size_t len = old_query_boundaries[qid + 1] - old_query_boundaries[qid];
        query_boundaries_[i + 1] = query_boundaries_[i] + len;
      }
      delete[] old_query_boundaries;
    }
    // get local initial scores
    if (init_score_ != nullptr) {
      score_t* old_scores = init_score_;
      num_init_score_ = num_data_;
      init_score_ = new score_t[num_init_score_];
      for (size_t i = 0; i < used_data_indices.size(); ++i) {
        init_score_[i] = old_scores[used_data_indices[i]];
      }
      delete[] old_scores;
    }
    // re-load query weight
    LoadQueryWeights();
  }
}
/*!
* \brief Take ownership of an externally allocated initial-score array.
* \param init_score Array of num_data_ scores; ownership transfers here
*/
void Metadata::SetInitScore(score_t* init_score) {
  delete[] init_score_;  // delete[] nullptr is a no-op
  init_score_ = init_score;
  num_init_score_ = num_data_;
}
/*!
* \brief Load per-record weights from "<data_filename>.weight" (one float
*        per line); no-op when the file is missing or empty.
*/
void Metadata::LoadWeights() {
  num_weights_ = 0;
  std::string weight_filename(data_filename_);
  // default weight file name
  weight_filename.append(".weight");
  TextReader<size_t> reader(weight_filename.c_str());
  reader.ReadAllLines();
  if (reader.Lines().size() <= 0) {
    return;
  }
  Log::Stdout("Start to load weights");
  num_weights_ = static_cast<data_size_t>(reader.Lines().size());
  // free any previously loaded weights so a re-load does not leak
  delete[] weights_;
  weights_ = new float[num_weights_];
  for (data_size_t i = 0; i < num_weights_; ++i) {
    double tmp_weight;
    Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
    weights_[i] = static_cast<float>(tmp_weight);
  }
}
/*!
* \brief Load initial scores from init_score_filename_ (one value per
*        line); no-op when no file name was configured.
*/
void Metadata::LoadInitialScore() {
  num_init_score_ = 0;
  if (init_score_filename_[0] == '\0') { return; }
  TextReader<size_t> reader(init_score_filename_);
  reader.ReadAllLines();
  Log::Stdout("Start to load initial score");
  num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
  // free any previously loaded scores so a re-load does not leak
  delete[] init_score_;
  init_score_ = new score_t[num_init_score_];
  double tmp;
  for (data_size_t i = 0; i < num_init_score_; ++i) {
    Common::Atof(reader.Lines()[i].c_str(), &tmp);
    init_score_[i] = static_cast<score_t>(tmp);
  }
}
/*!
* \brief Load query sizes from "<data_filename>.query" (one count per line)
*        and convert them to cumulative boundaries; boundary[i]..boundary[i+1]
*        is query i's record range. No-op when the file is missing or empty.
*/
void Metadata::LoadQueryBoundaries() {
  num_queries_ = 0;
  std::string query_filename(data_filename_);
  // default query file name
  query_filename.append(".query");
  TextReader<size_t> reader(query_filename.c_str());
  reader.ReadAllLines();
  if (reader.Lines().size() <= 0) {
    return;
  }
  Log::Stdout("Start to load query boundries");
  // free any previously loaded boundaries so a re-load does not leak
  delete[] query_boundaries_;
  query_boundaries_ = new data_size_t[reader.Lines().size() + 1];
  num_queries_ = static_cast<data_size_t>(reader.Lines().size());
  query_boundaries_[0] = 0;
  for (size_t i = 0; i < reader.Lines().size(); ++i) {
    int tmp_cnt;
    Common::Atoi(reader.Lines()[i].c_str(), &tmp_cnt);
    // prefix-sum of per-query counts yields the start offset of each query
    query_boundaries_[i + 1] = query_boundaries_[i] + static_cast<data_size_t>(tmp_cnt);
  }
}
/*!
* \brief Derive one weight per query as the mean of its records' weights.
*        Requires both weights_ and query_boundaries_ to be loaded.
*/
void Metadata::LoadQueryWeights() {
  if (weights_ == nullptr || query_boundaries_ == nullptr) {
    return;
  }
  Log::Stdout("Start to load query weights");
  // fix: this is called from both Init() and CheckOrPartition(); free the
  // previous allocation so the second call does not leak it
  delete[] query_weights_;
  query_weights_ = new float[num_queries_];
  for (data_size_t i = 0; i < num_queries_; ++i) {
    query_weights_[i] = 0.0f;
    for (data_size_t j = query_boundaries_[i]; j < query_boundaries_[i + 1]; ++j) {
      query_weights_[i] += weights_[j];
    }
    // average over the query's record count
    query_weights_[i] /= (query_boundaries_[i + 1] - query_boundaries_[i]);
  }
}
/*!
* \brief Restore metadata from a binary image with the layout produced by
*        SaveBinaryToFile: three counters, labels, then optional weights,
*        query boundaries and query weights.
* \param memory Pointer to the serialized metadata block
*/
void Metadata::LoadFromMemory(const void* memory) {
  const char* mem_ptr = reinterpret_cast<const char*>(memory);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_weights_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_weights_);
  num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_queries_);
  if (label_ != nullptr) { delete[] label_; }
  label_ = new float[num_data_];
  std::memcpy(label_, mem_ptr, sizeof(float)*num_data_);
  // fix: advance past the labels just copied (num_data_ floats); the old
  // code advanced by num_weights_, corrupting every following section
  // whenever num_weights_ != num_data_ (e.g. no weight file => 0)
  mem_ptr += sizeof(float)*num_data_;
  if (num_weights_ > 0) {
    if (weights_ != nullptr) { delete[] weights_; }
    weights_ = new float[num_weights_];
    std::memcpy(weights_, mem_ptr, sizeof(float)*num_weights_);
    mem_ptr += sizeof(float)*num_weights_;
  }
  if (num_queries_ > 0) {
    if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
    query_boundaries_ = new data_size_t[num_queries_ + 1];
    std::memcpy(query_boundaries_, mem_ptr, sizeof(data_size_t)*(num_queries_ + 1));
    mem_ptr += sizeof(data_size_t)*(num_queries_ + 1);
  }
  if (num_weights_ > 0 && num_queries_ > 0) {
    if (query_weights_ != nullptr) { delete[] query_weights_; }
    query_weights_ = new float[num_queries_];
    std::memcpy(query_weights_, mem_ptr, sizeof(float)*num_queries_);
    mem_ptr += sizeof(float)*num_queries_;
  }
}
/*!
* \brief Serialize metadata to a binary file. The write order must stay in
*        sync with LoadFromMemory: three counters, labels, then optional
*        weights, query boundaries and query weights.
* \param file Open writable file handle; fwrite errors are not checked
*/
void Metadata::SaveBinaryToFile(FILE* file) const {
  // fixed-size header: element counters
  fwrite(&num_data_, sizeof(num_data_), 1, file);
  fwrite(&num_weights_, sizeof(num_weights_), 1, file);
  fwrite(&num_queries_, sizeof(num_queries_), 1, file);
  // labels are always present
  fwrite(label_, sizeof(float), num_data_, file);
  // optional sections; the reader infers presence from the counters above
  if (weights_ != nullptr) {
    fwrite(weights_, sizeof(float), num_weights_, file);
  }
  if (query_boundaries_ != nullptr) {
    // num_queries_ + 1 cumulative boundaries, starting at 0
    fwrite(query_boundaries_, sizeof(data_size_t), num_queries_ + 1, file);
  }
  if (query_weights_ != nullptr) {
    fwrite(query_weights_, sizeof(float), num_queries_, file);
  }
}
/*!
* \brief Number of bytes SaveBinaryToFile will emit for this metadata.
* \return Serialized size in bytes
*/
size_t Metadata::SizesInByte() const {
  // header: the three element counters
  size_t total = sizeof(num_data_) + sizeof(num_weights_) + sizeof(num_queries_);
  // labels are always written
  total += sizeof(float) * num_data_;
  if (weights_ != nullptr) {
    total += sizeof(float) * num_weights_;
  }
  if (query_boundaries_ != nullptr) {
    // num_queries_ + 1 cumulative boundaries
    total += sizeof(data_size_t) * (num_queries_ + 1);
  }
  if (query_weights_ != nullptr) {
    total += sizeof(float) * num_queries_;
  }
  return total;
}
} // namespace LightGBM
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment