"python-package/vscode:/vscode.git/clone" did not exist on "b918b5b2ff2dd776527e134f26eb8c0f0f096cfa"
Commit b23a2c31 authored by xuehui's avatar xuehui Committed by GitHub
Browse files

Merge pull request #44 from guolinke/master

To solve #41
parents 2af0dccd 3a06ce35
......@@ -85,6 +85,12 @@ public:
*/
virtual int MaxFeatureIdx() const = 0;
/*!
* \brief Get index of label column
* \return index of label column
*/
virtual int LabelIdx() const = 0;
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
......
......@@ -100,6 +100,20 @@ public:
bool use_two_round_loading = false;
bool is_save_binary_file = false;
bool is_sigmoid = true;
bool has_header = false;
/*! \brief Index or column name of label, default is the first column
* And add an prefix "name:" while using column name */
std::string label_column = "";
/*! \brief Index or column name of weight, < 0 means not used
* And add an prefix "name:" while using column name */
std::string weight_column = "";
/*! \brief Index or column name of group, < 0 means not used */
std::string group_column = "";
/*! \brief ignored features, separate by ','
* e.g. name:column_name1,column_name2 */
std::string ignore_column = "";
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -323,7 +337,15 @@ struct ParameterAlias {
{ "save_binary", "is_save_binary_file" },
{ "early_stopping_rounds", "early_stopping_round"},
{ "early_stopping", "early_stopping_round"},
{ "verbosity", "verbose" }
{ "verbosity", "verbose" },
{ "header", "has_header" },
{ "label", "label_column" },
{ "weight", "weight_column" },
{ "group", "group_column" },
{ "query", "group_column" },
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" }
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
......
......@@ -5,11 +5,13 @@
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
#include <utility>
#include <functional>
#include <string>
#include <unordered_set>
namespace LightGBM {
......@@ -56,10 +58,12 @@ public:
~Metadata();
/*!
* \brief Initial work, will auto load weight, inital scores
* \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data
* \param weight_idx Index of weight column, < 0 means doesn't exists
* \param query_idx Index of query id column, < 0 means doesn't exists
*/
void InitLabel(data_size_t num_data);
void Init(data_size_t num_data, int weight_idx, int query_idx);
/*!
* \brief Partition label by used indices
......@@ -109,6 +113,26 @@ public:
label_[idx] = static_cast<float>(value);
}
/*!
* \brief Set Weight for one record
* \param idx Index of this record
* \param value Weight value of this record
*/
inline void SetWeightAt(data_size_t idx, double value)
{
// narrow from the parsed double to float; weights are stored internally as float
weights_[idx] = static_cast<float>(value);
}
/*!
* \brief Set Query Id for one record
* \param idx Index of this record
* \param value Query Id value of this record
*/
inline void SetQueryAt(data_size_t idx, double value)
{
// query ids are parsed as double from the data file; truncate to integral data_size_t
queries_[idx] = static_cast<data_size_t>(value);
}
/*!
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
......@@ -178,41 +202,35 @@ private:
data_size_t num_init_score_;
/*! \brief Initial score */
score_t* init_score_;
/*! \brief Queries data */
data_size_t* queries_;
};
/*! \brief Interface for Parser */
class Parser {
public:
/*! \brief virtual destructor */
virtual ~Parser() {}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features,
double* out_label) const = 0;
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
* \param out_features Output columns, store in (column_idx, values)
* \param out_label Label will store to this if exists
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const = 0;
std::vector<std::pair<int, double>>* out_features, double* out_label) const = 0;
/*!
* \brief Create a object of parser, will auto choose the format depend on file
* \param filename One Filename of data
* \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param has_label output, if num_features > 0, will output this data has label or not
* \param label_idx index of label column
* \return Object of parser
*/
static Parser* CreateParser(const char* filename, int num_features, bool* has_label);
static Parser* CreateParser(const char* filename, bool has_header, int num_features, int label_idx);
};
using PredictFunction =
......@@ -227,29 +245,21 @@ public:
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun);
const IOConfig& io_config, const PredictFunction& predict_fun);
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const char* data_filename,
int max_bin, int random_seed, bool is_enable_sparse,
const PredictFunction& predict_fun)
: Dataset(data_filename, "", max_bin, random_seed,
is_enable_sparse, predict_fun) {
const IOConfig& io_config, const PredictFunction& predict_fun)
: Dataset(data_filename, "", io_config, predict_fun) {
}
/*! \brief Destructor */
......@@ -304,6 +314,12 @@ public:
/*! \brief Get Number of total features */
inline int num_total_features() const { return num_total_features_; }
/*! \brief Get the index of label column */
inline int label_idx() const { return label_idx_; }
/*! \brief Get names of current data set */
inline std::vector<std::string> feature_names() const { return feature_names_; }
/*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; }
......@@ -394,10 +410,20 @@ private:
bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0;
// used to local used data indices
/*! \brief used to local used data indices */
std::vector<data_size_t> used_data_indices_;
// prediction function for initial model
/*! \brief prediction function for initial model */
const PredictFunction& predict_fun_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
};
} // namespace LightGBM
......
......@@ -43,7 +43,13 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) {
str.erase(0, str.find_first_not_of("'\""));
return str;
}
/*!
* \brief Check whether a string begins with a given prefix
* \param str String to check
* \param prefix Prefix to look for (an empty prefix always matches)
* \return true if str starts with prefix
*/
inline static bool StartsWith(const std::string& str, const std::string& prefix) {
  // compare() avoids the temporary string that substr() would allocate;
  // prefix is taken by const reference to avoid copying it at every call
  return str.compare(0, prefix.size(), prefix) == 0;
}
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
std::vector<std::string> ret;
std::string str(c_str);
......@@ -58,6 +64,21 @@ inline static std::vector<std::string> Split(const char* c_str, char delimiter)
return ret;
}
/*!
* \brief Split a C string at every occurrence of any character in delimiters
* \param c_str Null-terminated input string
* \param delimiters Set of single-character delimiters
* \return Tokens in order; empty tokens (including a trailing one) are kept
*/
inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
  std::vector<std::string> result;
  const std::string text(c_str);
  size_t token_begin = 0;
  for (size_t cut = text.find_first_of(delimiters);
       cut != std::string::npos;
       cut = text.find_first_of(delimiters, token_begin)) {
    result.push_back(text.substr(token_begin, cut - token_begin));
    token_begin = cut + 1;
  }
  // the remainder after the last delimiter (or the whole string if none)
  result.push_back(text.substr(token_begin));
  return result;
}
inline static const char* Atoi(const char* p, int* out) {
int sign, value;
while (*p == ' ') {
......
......@@ -21,7 +21,7 @@ public:
* \param filename Filename of data
* \param skip_bytes Number of bytes to skip at the start of the file
* \param process_fun Process function
*/
static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) {
static size_t Read(const char* filename, int skip_bytes, const std::function<size_t (const char*, size_t)>& process_fun) {
FILE* file;
#ifdef _MSC_VER
......@@ -38,8 +38,13 @@ public:
char* buffer_process = new char[buffer_size];
// buffer used for the file reading
char* buffer_read = new char[buffer_size];
size_t read_cnt = 0;
if (skip_bytes > 0) {
// skip first k bytes
read_cnt = fread(buffer_process, 1, skip_bytes, file);
}
// read first block
size_t read_cnt = fread(buffer_process, 1, buffer_size, file);
read_cnt = fread(buffer_process, 1, buffer_size, file);
size_t last_read_cnt = 0;
while (read_cnt > 0) {
// start read thread
......
......@@ -6,6 +6,7 @@
#include <LightGBM/utils/random.h>
#include <cstdio>
#include <sstream>
#include <vector>
#include <string>
......@@ -22,9 +23,41 @@ public:
/*!
* \brief Constructor
* \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/
TextReader(const char* filename):
filename_(filename){
/*!
* \brief Constructor. When is_skip_first_line is true, the header line is read
*        here, stored in first_line_, and skip_bytes_ is set so that later bulk
*        reads start just past the header (including its line terminator).
* \param filename Filename of data
* \param is_skip_first_line True if the first line is a header that should be skipped
*/
TextReader(const char* filename, bool is_skip_first_line):
  filename_(filename), is_skip_first_line_(is_skip_first_line){
  if (is_skip_first_line_) {
    FILE* file;
#ifdef _MSC_VER
    fopen_s(&file, filename, "r");
#else
    file = fopen(filename, "r");
#endif
    if (file == NULL) {
      // fail fast with a clear message instead of crashing in fgetc below
      Log::Fatal("Data file: %s doesn't exist", filename);
    }
    std::stringstream str_buf;
    int read_c = -1;
    // collect header characters up to (but not including) the line terminator
    read_c = fgetc(file);
    while (read_c != EOF) {
      char tmp_ch = static_cast<char>(read_c);
      if (tmp_ch == '\n' || tmp_ch == '\r') {
        break;
      }
      str_buf << tmp_ch;
      ++skip_bytes_;
      read_c = fgetc(file);
    }
    // consume the line terminator: '\r', '\n', or the Windows pair "\r\n"
    if (static_cast<char>(read_c) == '\r') {
      read_c = fgetc(file);
      ++skip_bytes_;
    }
    if (static_cast<char>(read_c) == '\n') {
      read_c = fgetc(file);
      ++skip_bytes_;
    }
    fclose(file);
    first_line_ = str_buf.str();
    Log::Info("skip header:\"%s\" in file %s", first_line_.c_str(), filename_);
  }
}
/*!
* \brief Destructor
......@@ -40,6 +73,12 @@ public:
lines_.shrink_to_fit();
}
/*!
* \brief Get the header line that was skipped while constructing the reader
* \return Copy of the first line of the file; empty when no header was skipped
*/
inline std::string first_line() {
return first_line_;
}
/*!
* \brief Get text data that read from file
* \return Text data, store in std::vector by line
*/
......@@ -48,7 +87,7 @@ public:
INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
last_line_ = "";
INDEX_T total_cnt = 0;
PipelineReader::Read(filename_,
PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun]
(const char* buffer_process, size_t read_cnt) {
size_t cnt = 0;
......@@ -176,7 +215,7 @@ public:
last_line_ = "";
INDEX_T total_cnt = 0;
INDEX_T used_cnt = 0;
PipelineReader::Read(filename_,
PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun,&used_cnt, &filter_fun]
(const char* buffer_process, size_t read_cnt) {
size_t cnt = 0;
......@@ -260,6 +299,12 @@ private:
std::vector<std::string> lines_;
/*! \brief Buffer for last line */
std::string last_line_;
/*! \brief first line */
std::string first_line_="";
/*! \brief is skip first line */
bool is_skip_first_line_ = false;
/*! \brief is skip first line */
int skip_bytes_ = 0;
};
} // namespace LightGBM
......
......@@ -76,7 +76,7 @@ void Application::LoadParameters(int argc, char** argv) {
ParameterAlias::KeyAliasTransform(&params);
// read parameters from config file
if (params.count("config_file") > 0) {
TextReader<size_t> config_reader(params["config_file"].c_str());
TextReader<size_t> config_reader(params["config_file"].c_str(), false);
config_reader.ReadAllLines();
if (config_reader.Lines().size() > 0) {
for (auto& line : config_reader.Lines()) {
......@@ -139,9 +139,7 @@ void Application::LoadData() {
}
train_data_ = new Dataset(config_.io_config.data_filename.c_str(),
config_.io_config.input_init_score.c_str(),
config_.io_config.max_bin,
config_.io_config.data_random_seed,
config_.io_config.is_enable_sparse,
config_.io_config,
predict_fun);
// load Training data
if (config_.is_parallel_find_bin) {
......@@ -173,9 +171,7 @@ void Application::LoadData() {
// add
valid_datas_.push_back(
new Dataset(config_.io_config.valid_data_filenames[i].c_str(),
config_.io_config.max_bin,
config_.io_config.data_random_seed,
config_.io_config.is_enable_sparse,
config_.io_config,
predict_fun));
// load validation data like train data
valid_datas_.back()->LoadValidationData(train_data_,
......@@ -253,7 +249,8 @@ void Application::Train() {
void Application::Predict() {
// create predictor
Predictor predictor(boosting_, config_.io_config.is_sigmoid, config_.predict_leaf_index);
predictor.Predict(config_.io_config.data_filename.c_str(), config_.io_config.output_result.c_str());
predictor.Predict(config_.io_config.data_filename.c_str(),
config_.io_config.output_result.c_str(), config_.io_config.has_header);
Log::Info("Finish predict.");
}
......@@ -265,7 +262,7 @@ void Application::InitPredict() {
}
void Application::LoadModel() {
TextReader<size_t> model_reader(config_.io_config.input_model.c_str());
TextReader<size_t> model_reader(config_.io_config.input_model.c_str(), false);
model_reader.ReadAllLines();
std::stringstream ss;
for (auto& line : model_reader.Lines()) {
......
......@@ -92,7 +92,7 @@ public:
* \param has_header True if the data file has a header line that should be skipped
* \param result_filename Filename of output result
*/
void Predict(const char* data_filename, const char* result_filename) {
void Predict(const char* data_filename, const char* result_filename, bool has_header) {
FILE* result_file;
#ifdef _MSC_VER
......@@ -104,8 +104,7 @@ public:
if (result_file == NULL) {
Log::Fatal("Predition result file %s doesn't exists", data_filename);
}
bool has_label = false;
Parser* parser = Parser::CreateParser(data_filename, num_features_, &has_label);
Parser* parser = Parser::CreateParser(data_filename, has_header, num_features_, boosting_->LabelIdx());
if (parser == nullptr) {
Log::Fatal("Recongnizing input data format failed, filename %s", data_filename);
......@@ -114,21 +113,12 @@ public:
// function for parse data
std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
double tmp_label;
if (has_label) {
// parse function with label
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
Log::Info("Start prediction for data %s with labels", data_filename);
} else {
// parse function without label
parser_fun = [this, &parser]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature);
};
Log::Info("Start prediction for data %s without label", data_filename);
}
parser_fun = [this, &parser, &tmp_label]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
if (predict_leaf_index) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features){
......@@ -173,7 +163,7 @@ public:
fprintf(result_file, "%s\n", pred_result[i].c_str());
}
};
TextReader<data_size_t> predict_data_reader(data_filename);
TextReader<data_size_t> predict_data_reader(data_filename, has_header);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
fclose(result_file);
......
......@@ -61,7 +61,8 @@ void GBDT::Init(const Dataset* train_data, const ObjectiveFunction* object_funct
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
label_idx_ = train_data_->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = new data_size_t[num_data_];
......@@ -276,19 +277,21 @@ void GBDT::Boosting() {
std::string GBDT::ModelsToString() const {
// serialize this object to string
std::stringstream ss;
std::stringstream str_buf;
// output label index
str_buf << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx
ss << "max_feature_idx=" << max_feature_idx_ << std::endl;
str_buf << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output sigmoid parameter
ss << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
ss << std::endl;
str_buf << "sigmoid=" << object_function_->GetSigmoid() << std::endl;
str_buf << std::endl;
// output tree models
for (size_t i = 0; i < models_.size(); ++i) {
ss << "Tree=" << i << std::endl;
ss << models_[i]->ToString() << std::endl;
str_buf << "Tree=" << i << std::endl;
str_buf << models_[i]->ToString() << std::endl;
}
return ss.str();
return str_buf.str();
}
void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
......@@ -296,7 +299,26 @@ void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
models_.clear();
std::vector<std::string> lines = Common::Split(model_str.c_str(), '\n');
size_t i = 0;
// get index of label
while (i < lines.size()) {
size_t find_pos = lines[i].find("label_index=");
if (find_pos != std::string::npos) {
std::vector<std::string> strs = Common::Split(lines[i].c_str(), '=');
Common::Atoi(strs[1].c_str(), &label_idx_);
++i;
break;
} else {
++i;
}
}
if (i == lines.size()) {
Log::Fatal("Model file doesn't contain label index");
return;
}
// get max_feature_idx first
i = 0;
while (i < lines.size()) {
size_t find_pos = lines[i].find("max_feature_idx=");
if (find_pos != std::string::npos) {
......
......@@ -82,6 +82,13 @@ public:
* \return Max feature index of this model
*/
inline int MaxFeatureIdx() const override { return max_feature_idx_; }
/*!
* \brief Get index of label column
* \return index of label column
*/
inline int LabelIdx() const override { return label_idx_; }
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
......@@ -173,6 +180,9 @@ private:
* if > 0 meas output score will transform by sigmoid function
*/
double sigmoid_;
/*! \brief Index of label column */
data_size_t label_idx_;
};
} // namespace LightGBM
......
......@@ -163,6 +163,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
if (GetString(params, "valid_data", &tmp_str)) {
valid_data_filenames = Common::Split(tmp_str.c_str(), ',');
}
GetBool(params, "has_header", &has_header);
GetString(params, "label_column", &label_column);
GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column);
}
......
......@@ -11,13 +11,14 @@
#include <vector>
#include <utility>
#include <string>
#include <sstream>
namespace LightGBM {
Dataset::Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun)
:data_filename_(data_filename), random_(random_seed),
max_bin_(max_bin), is_enable_sparse_(is_enable_sparse), predict_fun_(predict_fun) {
const IOConfig& io_config, const PredictFunction& predict_fun)
:data_filename_(data_filename), random_(io_config.data_random_seed),
max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse), predict_fun_(predict_fun) {
CheckCanLoadFromBin();
if (is_loading_from_binfile_ && predict_fun != nullptr) {
......@@ -28,13 +29,134 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
if (!is_loading_from_binfile_) {
// load weight, query information and initilize score
metadata_.Init(data_filename, init_score_filename);
// create text reader
text_reader_ = new TextReader<data_size_t>(data_filename, io_config.has_header);
std::unordered_map<std::string, int> name2idx;
// get column names
if (io_config.has_header) {
std::string first_line = text_reader_->first_line();
feature_names_ = Common::Split(first_line.c_str(), "\t ,");
for (size_t i = 0; i < feature_names_.size(); ++i) {
name2idx[feature_names_[i]] = static_cast<int>(i);
}
}
std::string name_prefix("name:");
// load label idx
if (io_config.label_column.size() > 0) {
if (Common::StartsWith(io_config.label_column, name_prefix)) {
std::string name = io_config.label_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
label_idx_ = name2idx[name];
Log::Info("use %s column as label", name.c_str());
} else {
Log::Fatal("cannot find label column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
label_idx_ = std::stoi(io_config.label_column, &pos);
if (pos != io_config.label_column.size()) {
Log::Fatal("label_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as label", label_idx_);
}
}
if (feature_names_.size() > 0) {
// erase label column name
feature_names_.erase(feature_names_.begin() + label_idx_);
}
// load ignore columns
if (io_config.ignore_column.size() > 0) {
if (Common::StartsWith(io_config.ignore_column, name_prefix)) {
std::string names = io_config.ignore_column.substr(name_prefix.size());
for (auto name : Common::Split(names.c_str(), ',')) {
if (name2idx.count(name) > 0) {
int tmp = name2idx[name];
// skip for label column
if (tmp > label_idx_) { tmp -= 1; }
ignore_features_.emplace(tmp);
} else {
Log::Fatal("cannot find column: %s in data file", name.c_str());
}
}
} else {
for (auto token : Common::Split(io_config.ignore_column.c_str(), ',')) {
size_t pos = 0;
int tmp = std::stoi(token, &pos);
if (pos != token.size()) {
Log::Fatal("ignore_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
// skip for label column
if (tmp > label_idx_) { tmp -= 1; }
ignore_features_.emplace(tmp);
}
}
}
// load weight idx
if (io_config.weight_column.size() > 0) {
if (Common::StartsWith(io_config.weight_column, name_prefix)) {
std::string name = io_config.weight_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
weight_idx_ = name2idx[name];
Log::Info("use %s column as weight", name.c_str());
} else {
Log::Fatal("cannot find weight column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
weight_idx_ = std::stoi(io_config.weight_column, &pos);
if (pos != io_config.weight_column.size()) {
Log::Fatal("weight_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as weight", weight_idx_);
}
// skip for label column
if (weight_idx_ > label_idx_) {
weight_idx_ -= 1;
}
ignore_features_.emplace(weight_idx_);
}
if (io_config.group_column.size() > 0) {
if (Common::StartsWith(io_config.group_column, name_prefix)) {
std::string name = io_config.group_column.substr(name_prefix.size());
if (name2idx.count(name) > 0) {
group_idx_ = name2idx[name];
Log::Info("use %s column as group/query id", name.c_str());
} else {
Log::Fatal("cannot find group/query column: %s in data file", name.c_str());
}
} else {
size_t pos = 0;
group_idx_ = std::stoi(io_config.group_column, &pos);
if (pos != io_config.group_column.size()) {
Log::Fatal("group_column is not a number, \
if you want to use column name, \
please add prefix \"name:\" before column name");
}
Log::Info("use %d-th column as group/query id", group_idx_);
}
// skip for label column
if (group_idx_ > label_idx_) {
group_idx_ -= 1;
}
ignore_features_.emplace(group_idx_);
}
// create text parser
parser_ = Parser::CreateParser(data_filename_, 0, nullptr);
parser_ = Parser::CreateParser(data_filename_, io_config.has_header, 0, label_idx_);
if (parser_ == nullptr) {
Log::Fatal("Cannot recognising input data format, filename: %s", data_filename_);
}
// create text reader
text_reader_ = new TextReader<data_size_t>(data_filename);
} else {
// only need to load initilize score, other meta data will be loaded from bin flie
metadata_.Init(init_score_filename);
......@@ -190,18 +312,40 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
// -1 means doesn't use this feature
used_feature_map_ = std::vector<int>(sample_values.size(), -1);
num_total_features_ = static_cast<int>(sample_values.size());
// check the range of label_idx, weight_idx and group_idx
CHECK(label_idx_ >= 0 && label_idx_ <= num_total_features_);
CHECK(weight_idx_ < 0 || weight_idx_ < num_total_features_);
CHECK(group_idx_ < 0 || group_idx_ < num_total_features_);
// fill feature_names_ if not header
if (feature_names_.size() <= 0) {
for (int i = 0; i < num_total_features_; ++i) {
std::stringstream str_buf;
str_buf << "Column_" << i;
feature_names_.push_back(str_buf.str());
}
}
// start find bins
if (num_machines == 1) {
std::vector<BinMapper*> bin_mappers(sample_values.size());
// if only 1 machines, find bin locally
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i] = new BinMapper();
bin_mappers[i]->FindBin(&sample_values[i], max_bin_);
}
for (size_t i = 0; i < sample_values.size(); ++i) {
if (!bin_mappers[i]->is_trival()) {
if (bin_mappers[i] == nullptr) {
Log::Error("Ignore Feature %s ", feature_names_[i].c_str());
}
else if (!bin_mappers[i]->is_trival()) {
// map real feature index to used feature index
used_feature_map_[i] = static_cast<int>(features_.size());
// push new feature
......@@ -209,7 +353,7 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
num_data_, is_enable_sparse_));
} else {
// if feature is trival(only 1 bin), free spaces
Log::Error("Feature %d only contains one value, will be ignored", i);
Log::Error("Feature %s only contains one value, will be ignored", feature_names_[i].c_str());
delete bin_mappers[i];
}
}
......@@ -256,12 +400,17 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
// restore features bins from buffer
for (int i = 0; i < total_num_feature; ++i) {
if (ignore_features_.count(i) > 0) {
Log::Error("Ignore Feature %s ", feature_names_[i].c_str());
continue;
}
BinMapper* bin_mapper = new BinMapper();
bin_mapper->CopyFrom(output_buffer + i * type_size);
if (!bin_mapper->is_trival()) {
used_feature_map_[i] = static_cast<int>(features_.size());
features_.push_back(new Feature(static_cast<int>(i), bin_mapper, num_data_, is_enable_sparse_));
} else {
Log::Error("Feature %s only contains one value, will be ignored", feature_names_[i].c_str());
delete bin_mapper;
}
}
......@@ -276,6 +425,13 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
// don't support query id in data file when training in parallel
if (num_machines > 1 && !is_pre_partition) {
if (group_idx_ > 0) {
Log::Fatal("Don't support query id in data file when training parallel without pre-partition. \
Please use an additional query file or pre-partition your data");
}
}
used_data_indices_.clear();
if (!is_loading_from_binfile_ ) {
if (!use_two_round_loading) {
......@@ -287,7 +443,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromMemory();
} else {
......@@ -297,7 +453,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromFile();
......@@ -322,7 +478,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// read data in memory
LoadDataToMemory(0, 1, false);
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
......@@ -336,7 +492,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// Get number of lines of data file
num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
// initialize label
metadata_.InitLabel(num_data_);
metadata_.Init(num_data_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
......@@ -381,6 +537,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_[feature_idx]->PushData(tid, i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(i, inner_data.second);
}
}
}
}
} else {
......@@ -407,6 +570,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_[feature_idx]->PushData(tid, i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(i, inner_data.second);
}
}
}
}
// metadata_ will manage space of init_score
......@@ -451,6 +621,13 @@ void Dataset::ExtractFeaturesFromFile() {
// if is used feature
features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
}
else {
if (inner_data.first == weight_idx_) {
metadata_.SetWeightAt(start_idx + i, inner_data.second);
} else if (inner_data.first == group_idx_) {
metadata_.SetQueryAt(start_idx + i, inner_data.second);
}
}
}
}
};
......
......@@ -10,7 +10,7 @@ namespace LightGBM {
Metadata::Metadata()
:label_(nullptr), label_int_(nullptr), weights_(nullptr),
query_boundaries_(nullptr),
query_weights_(nullptr), init_score_(nullptr) {
query_weights_(nullptr), init_score_(nullptr), queries_(nullptr){
}
......@@ -36,12 +36,31 @@ Metadata::~Metadata() {
if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
if (query_weights_ != nullptr) { delete[] query_weights_; }
if (init_score_ != nullptr) { delete[] init_score_; }
if (queries_ != nullptr) { delete[] queries_; }
}
void Metadata::InitLabel(data_size_t num_data) {
/*!
* \brief Allocate per-record storage: label always; weights / query ids only
*        when the corresponding column exists in the data file
* \param num_data Number of records
* \param weight_idx Column index of weight in the data file, < 0 means absent
* \param query_idx Column index of query id in the data file, < 0 means absent
*/
void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
  num_data_ = num_data;
  // guard against a leak if Init is called more than once
  if (label_ != nullptr) { delete[] label_; }
  label_ = new float[num_data_];
  if (weight_idx >= 0) {
    if (weights_ != nullptr) {
      // per-record weights in the data file take precedence over the .weight file
      Log::Info("using weight in data file, and ignore additional weight file");
      delete[] weights_;
      weights_ = nullptr;  // avoid a dangling pointer between delete and re-assignment
    }
    weights_ = new float[num_data_];
    num_weights_ = num_data_;
    memset(weights_, 0, sizeof(float) * num_data_);
  }
  if (query_idx >= 0) {
    if (query_boundaries_ != nullptr) {
      // query ids in the data file take precedence over the .query file
      Log::Info("using query id in data file, and ignore additional query file");
      delete[] query_boundaries_;
      query_boundaries_ = nullptr;  // else the destructor could double-delete
    }
    if (query_weights_ != nullptr) {
      delete[] query_weights_;
      query_weights_ = nullptr;
    }
    queries_ = new data_size_t[num_data_];
    memset(queries_, 0, sizeof(data_size_t) * num_data_);
  }
}
void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
......@@ -59,6 +78,32 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
if (used_data_indices.size() == 0) {
if (queries_ != nullptr) {
// need convert query_id to boundaries
std::vector<data_size_t> tmp_buffer;
data_size_t last_qid = -1;
data_size_t cur_cnt = 0;
for (data_size_t i = 0; i < num_data_; ++i) {
if (last_qid != queries_[i]) {
if (cur_cnt > 0) {
tmp_buffer.push_back(cur_cnt);
}
cur_cnt = 0;
last_qid = queries_[i];
}
++cur_cnt;
}
tmp_buffer.push_back(cur_cnt);
query_boundaries_ = new data_size_t[tmp_buffer.size() + 1];
num_queries_ = static_cast<data_size_t>(tmp_buffer.size());
query_boundaries_[0] = 0;
for (size_t i = 0; i < tmp_buffer.size(); ++i) {
query_boundaries_[i + 1] = query_boundaries_[i] + tmp_buffer[i];
}
LoadQueryWeights();
delete[] queries_;
queries_ = nullptr;
}
// check weights
if (weights_ != nullptr && num_weights_ != num_data_) {
Log::Error("Initial weight size doesn't equal to data, weights will be ignored");
......@@ -131,10 +176,10 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
used_query.push_back(qid);
data_idx += len;
} else {
Log::Fatal("Data partition error, data didn't match queies");
Log::Fatal("Data partition error, data didn't match queries");
}
} else {
Log::Fatal("Data partition error, data didn't match queies");
Log::Fatal("Data partition error, data didn't match queries");
}
}
data_size_t * old_query_boundaries = query_boundaries_;
......@@ -177,7 +222,7 @@ void Metadata::LoadWeights() {
std::string weight_filename(data_filename_);
// default weight file name
weight_filename.append(".weight");
TextReader<size_t> reader(weight_filename.c_str());
TextReader<size_t> reader(weight_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
......@@ -195,7 +240,7 @@ void Metadata::LoadWeights() {
void Metadata::LoadInitialScore() {
num_init_score_ = 0;
if (init_score_filename_[0] == '\0') { return; }
TextReader<size_t> reader(init_score_filename_);
TextReader<size_t> reader(init_score_filename_, false);
reader.ReadAllLines();
Log::Info("Start loading initial scores");
......@@ -213,7 +258,7 @@ void Metadata::LoadQueryBoundaries() {
std::string query_filename(data_filename_);
// default query file name
query_filename.append(".query");
TextReader<size_t> reader(query_filename.c_str());
TextReader<size_t> reader(query_filename.c_str(), false);
reader.ReadAllLines();
if (reader.Lines().size() <= 0) {
return;
......
......@@ -2,6 +2,7 @@
#include <iostream>
#include <fstream>
#include <functional>
namespace LightGBM {
......@@ -20,44 +21,65 @@ void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt)
}
}
bool CheckHasLabelForLibsvm(std::string& str) {
int GetLabelIdxForLibsvm(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto pos_space = str.find_first_of(" \f\n\r\t\v");
auto pos_colon = str.find_first_of(":");
if (pos_colon == std::string::npos || pos_colon > pos_space) {
return true;
return -1;
} else {
return false;
return label_idx;
}
}
bool CheckHasLabelForTSV(std::string& str, int num_features) {
int GetLabelIdxForTSV(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), '\t');
if (static_cast<int>(tokens.size()) == num_features) {
return false;
return -1;
} else {
return true;
return label_idx;
}
}
bool CheckHasLabelForCSV(std::string& str, int num_features) {
int GetLabelIdxForCSV(std::string& str, int num_features, int label_idx) {
if (num_features <= 0) {
return label_idx;
}
str = Common::Trim(str);
auto tokens = Common::Split(str.c_str(), ',');
if (static_cast<int>(tokens.size()) == num_features) {
return false;
return -1;
} else {
return true;
return label_idx;
}
}
Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_label) {
// File format detected from the first one or two lines of the training data;
// used by Parser::CreateParser to pick the concrete parser class.
enum DataType {
  INVALID,  // format could not be determined
  CSV,      // comma-separated columns
  TSV,      // tab-separated columns
  LIBSVM    // sparse "label index:value ..." format
};
Parser* Parser::CreateParser(const char* filename, bool has_header, int num_features, int label_idx) {
std::ifstream tmp_file;
tmp_file.open(filename);
if (!tmp_file.is_open()) {
Log::Fatal("Data file: %s doesn't exist", filename);
}
std::string line1, line2;
if (has_header) {
if (!tmp_file.eof()) {
std::getline(tmp_file, line1);
}
}
if (!tmp_file.eof()) {
std::getline(tmp_file, line1);
} else {
......@@ -75,44 +97,48 @@ Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_l
// Get some statistic from 2 line
GetStatistic(line1.c_str(), &comma_cnt, &tab_cnt, &colon_cnt);
GetStatistic(line2.c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2);
Parser* ret = nullptr;
DataType type = DataType::INVALID;
if (line2.size() == 0) {
// if only have one line on file
if (colon_cnt > 0) {
ret = new LibSVMParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForLibsvm(line1);
}
type = DataType::LIBSVM;
} else if (tab_cnt > 0) {
ret = new TSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForTSV(line1, num_features);
}
type = DataType::TSV;
} else if (comma_cnt > 0) {
ret = new CSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForCSV(line1, num_features);
}
}
type = DataType::CSV;
}
} else {
if (colon_cnt > 0 || colon_cnt2 > 0) {
ret = new LibSVMParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForLibsvm(line1);
}
}
else if (tab_cnt == tab_cnt2 && tab_cnt > 0) {
ret = new TSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForTSV(line1, num_features);
}
type = DataType::LIBSVM;
} else if (tab_cnt == tab_cnt2 && tab_cnt > 0) {
type = DataType::TSV;
} else if (comma_cnt == comma_cnt2 && comma_cnt > 0) {
ret = new CSVParser();
if (num_features > 0 && has_label != nullptr) {
*has_label = CheckHasLabelForCSV(line1, num_features);
}
type = DataType::CSV;
}
}
if (type == DataType::INVALID) {
Log::Fatal("Unkown format of training data");
}
Parser* ret = nullptr;
if (type == DataType::LIBSVM) {
label_idx = GetLabelIdxForLibsvm(line1, num_features, label_idx);
ret = new LibSVMParser(label_idx);
}
else if (type == DataType::TSV) {
label_idx = GetLabelIdxForTSV(line1, num_features, label_idx);
ret = new TSVParser(label_idx);
}
else if (type == DataType::CSV) {
label_idx = GetLabelIdxForCSV(line1, num_features, label_idx);
ret = new CSVParser(label_idx);
}
if (label_idx < 0) {
Log::Info("Data file: %s doesn't contain label column", filename);
}
return ret;
}
......
......@@ -14,14 +14,23 @@ namespace LightGBM {
class CSVParser: public Parser {
public:
explicit CSVParser(int label_idx)
:label_idx_(label_idx) {
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const override {
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
int bias = 0;
*out_label = 0.0f;
while (*str != '\0') {
str = Common::Atof(str, &val);
if (fabs(val) > 1e-10) {
out_features->emplace_back(idx, val);
if (idx == label_idx_) {
*out_label = val;
bias = -1;
}
else if (fabs(val) > 1e-10) {
out_features->emplace_back(idx + bias, val);
}
++idx;
if (*str == ',') {
......@@ -31,28 +40,27 @@ public:
}
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == ',') {
++str;
} else if (*str != '\0') {
Log::Fatal("input format error, should be CSV");
}
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
class TSVParser: public Parser {
public:
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features) const override {
explicit TSVParser(int label_idx)
:label_idx_(label_idx) {
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
int bias = 0;
while (*str != '\0') {
str = Common::Atof(str, &val);
if (fabs(val) > 1e-10) {
out_features->emplace_back(idx, val);
if (idx == label_idx_) {
*out_label = val;
bias = -1;
} else if (fabs(val) > 1e-10) {
out_features->emplace_back(idx + bias, val);
}
++idx;
if (*str == '\t') {
......@@ -62,24 +70,27 @@ public:
}
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == '\t') {
++str;
} else if (*str != '\0') {
Log::Fatal("input format error, should be TSV");
}
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
class LibSVMParser: public Parser {
public:
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features) const override {
explicit LibSVMParser(int label_idx)
:label_idx_(label_idx) {
if (label_idx > 0) {
Log::Fatal("label should be the first column in Libsvm file");
}
}
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
int idx = 0;
double val = 0.0;
if (label_idx_ == 0) {
str = Common::Atof(str, &val);
*out_label = val;
str = Common::SkipSpaceAndTab(str);
}
while (*str != '\0') {
str = Common::Atoi(str, &idx);
str = Common::SkipSpaceAndTab(str);
......@@ -93,13 +104,9 @@ public:
str = Common::SkipSpaceAndTab(str);
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
str = Common::SkipSpaceAndTab(str);
return ParseOneLine(str, out_features);
}
private:
int label_idx_ = 0;
};
} // namespace LightGBM
#endif // LightGBM_IO_PARSER_HPP_
......@@ -77,7 +77,7 @@ Linkers::~Linkers() {
}
void Linkers::ParseMachineList(const char * filename) {
TextReader<size_t> machine_list_reader(filename);
TextReader<size_t> machine_list_reader(filename, false);
machine_list_reader.ReadAllLines();
if (machine_list_reader.Lines().size() <= 0) {
Log::Fatal("Machine list file:%s doesn't exist", filename);
......
......@@ -47,7 +47,7 @@ public:
// get boundries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Fatal("For NDCG metric, should have query information");
Log::Fatal("For lambdarank tasks, should have query information");
}
num_queries_ = metadata.num_queries();
// cache inverse max DCG, avoid computation many times
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.