Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
b23a2c31
Commit
b23a2c31
authored
Oct 29, 2016
by
xuehui
Committed by
GitHub
Oct 29, 2016
Browse files
Merge pull request #44 from guolinke/master
To solve #41
parents
2af0dccd
3a06ce35
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
574 additions
and
170 deletions
+574
-170
include/LightGBM/boosting.h
include/LightGBM/boosting.h
+6
-0
include/LightGBM/config.h
include/LightGBM/config.h
+23
-1
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+57
-31
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+22
-1
include/LightGBM/utils/pipeline_reader.h
include/LightGBM/utils/pipeline_reader.h
+7
-2
include/LightGBM/utils/text_reader.h
include/LightGBM/utils/text_reader.h
+49
-4
src/application/application.cpp
src/application/application.cpp
+6
-9
src/application/predictor.hpp
src/application/predictor.hpp
+9
-19
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+30
-8
src/boosting/gbdt.h
src/boosting/gbdt.h
+10
-0
src/io/config.cpp
src/io/config.cpp
+5
-0
src/io/dataset.cpp
src/io/dataset.cpp
+189
-12
src/io/metadata.cpp
src/io/metadata.cpp
+52
-7
src/io/parser.cpp
src/io/parser.cpp
+64
-38
src/io/parser.hpp
src/io/parser.hpp
+43
-36
src/network/linkers_socket.cpp
src/network/linkers_socket.cpp
+1
-1
src/objective/rank_objective.hpp
src/objective/rank_objective.hpp
+1
-1
No files found.
include/LightGBM/boosting.h
View file @
b23a2c31
...
...
@@ -85,6 +85,12 @@ public:
*/
virtual
int
MaxFeatureIdx
()
const
=
0
;
/*!
* \brief Get index of label column
* \return index of label column
*/
virtual
int
LabelIdx
()
const
=
0
;
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
...
...
include/LightGBM/config.h
View file @
b23a2c31
...
...
@@ -100,6 +100,20 @@ public:
bool
use_two_round_loading
=
false
;
bool
is_save_binary_file
=
false
;
bool
is_sigmoid
=
true
;
bool
has_header
=
false
;
/*! \brief Index or column name of label, default is the first column
* Add the prefix "name:" when using a column name */
std
::
string
label_column
=
""
;
/*! \brief Index or column name of weight, < 0 means not used
* Add the prefix "name:" when using a column name */
std
::
string
weight_column
=
""
;
/*! \brief Index or column name of group, < 0 means not used */
std
::
string
group_column
=
""
;
/*! \brief ignored features, separate by ','
* e.g. name:column_name1,column_name2 */
std
::
string
ignore_column
=
""
;
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
override
;
};
...
...
@@ -323,7 +337,15 @@ struct ParameterAlias {
{
"save_binary"
,
"is_save_binary_file"
},
{
"early_stopping_rounds"
,
"early_stopping_round"
},
{
"early_stopping"
,
"early_stopping_round"
},
{
"verbosity"
,
"verbose"
}
{
"verbosity"
,
"verbose"
},
{
"header"
,
"has_header"
},
{
"label"
,
"label_column"
},
{
"weight"
,
"weight_column"
},
{
"group"
,
"group_column"
},
{
"query"
,
"group_column"
},
{
"query_column"
,
"group_column"
},
{
"ignore_feature"
,
"ignore_column"
},
{
"blacklist"
,
"ignore_column"
}
});
std
::
unordered_map
<
std
::
string
,
std
::
string
>
tmp_map
;
for
(
const
auto
&
pair
:
*
params
)
{
...
...
include/LightGBM/dataset.h
View file @
b23a2c31
...
...
@@ -5,11 +5,13 @@
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
#include <utility>
#include <functional>
#include <string>
#include <unordered_set>
namespace
LightGBM
{
...
...
@@ -56,10 +58,12 @@ public:
~
Metadata
();
/*!
* \brief Initial work, will a
uto load weight, inital scores
* \brief Initial work, will a
llocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data
* \param weight_idx Index of weight column, < 0 means doesn't exists
* \param query_idx Index of query id column, < 0 means doesn't exists
*/
void
Init
Label
(
data_size_t
num_data
);
void
Init
(
data_size_t
num_data
,
int
weight_idx
,
int
query_idx
);
/*!
* \brief Partition label by used indices
...
...
@@ -109,6 +113,26 @@ public:
label_
[
idx
]
=
static_cast
<
float
>
(
value
);
}
/*!
* \brief Set Weight for one record
* \param idx Index of this record
* \param value Weight value of this record
*/
inline
void
SetWeightAt
(
data_size_t
idx
,
double
value
)
{
weights_
[
idx
]
=
static_cast
<
float
>
(
value
);
}
/*!
* \brief Set Query Id for one record
* \param idx Index of this record
* \param value Query Id value of this record
*/
inline
void
SetQueryAt
(
data_size_t
idx
,
double
value
)
{
queries_
[
idx
]
=
static_cast
<
data_size_t
>
(
value
);
}
/*!
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
...
...
@@ -178,41 +202,35 @@ private:
data_size_t
num_init_score_
;
/*! \brief Initial score */
score_t
*
init_score_
;
/*! \brief Queries data */
data_size_t
*
queries_
;
};
/*! \brief Interface for Parser */
class
Parser
{
public:
/*! \brief virtual destructor */
virtual
~
Parser
()
{}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
=
0
;
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output
feature
s, store in (
feature_idx, feature_
value)
* \param out_label
Output label
* \param out_features Output
column
s, store in (
column_idx,
value
s
)
* \param out_label
Label will store to this if exists
*/
virtual
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
)
const
=
0
;
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
=
0
;
/*!
* \brief Create a object of parser, will auto choose the format depend on file
* \param filename One Filename of data
* \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param
has_
label
output, if num_features > 0, will output this data has
label
or not
* \param label
_idx index of
label
column
* \return Object of parser
*/
static
Parser
*
CreateParser
(
const
char
*
filename
,
int
num_features
,
bool
*
has_
label
);
static
Parser
*
CreateParser
(
const
char
*
filename
,
bool
has_header
,
int
num_features
,
int
label
_idx
);
};
using
PredictFunction
=
...
...
@@ -227,29 +245,21 @@ public:
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
int
max_bin
,
int
random_seed
,
bool
is_enable_sparse
,
const
PredictFunction
&
predict_fun
);
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param is_int_label True if label is int type
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
int
max_bin
,
int
random_seed
,
bool
is_enable_sparse
,
const
PredictFunction
&
predict_fun
)
:
Dataset
(
data_filename
,
""
,
max_bin
,
random_seed
,
is_enable_sparse
,
predict_fun
)
{
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
Dataset
(
data_filename
,
""
,
io_config
,
predict_fun
)
{
}
/*! \brief Destructor */
...
...
@@ -304,6 +314,12 @@ public:
/*! \brief Get Number of total features */
inline
int
num_total_features
()
const
{
return
num_total_features_
;
}
/*! \brief Get the index of label column */
inline
int
label_idx
()
const
{
return
label_idx_
;
}
/*! \brief Get names of current data set */
inline
std
::
vector
<
std
::
string
>
feature_names
()
const
{
return
feature_names_
;
}
/*! \brief Get Number of data */
inline
data_size_t
num_data
()
const
{
return
num_data_
;
}
...
...
@@ -394,10 +410,20 @@ private:
bool
is_loading_from_binfile_
;
/*! \brief Number of global data, used for distributed learning */
size_t
global_num_data_
=
0
;
/
/
used to local used data indices
/
*! \brief
used to local used data indices
*/
std
::
vector
<
data_size_t
>
used_data_indices_
;
/
/
prediction function for initial model
/
*! \brief
prediction function for initial model
*/
const
PredictFunction
&
predict_fun_
;
/*! \brief index of label column */
int
label_idx_
=
0
;
/*! \brief index of weight column */
int
weight_idx_
=
-
1
;
/*! \brief index of group column */
int
group_idx_
=
-
1
;
/*! \brief Indices of columns excluded from the features (ignored, weight and group columns) */
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
};
}
// namespace LightGBM
...
...
include/LightGBM/utils/common.h
View file @
b23a2c31
...
...
@@ -43,7 +43,13 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) {
str
.
erase
(
0
,
str
.
find_first_not_of
(
"'
\"
"
));
return
str
;
}
/*!
* \brief Check whether a string begins with a given prefix
* \param str String to inspect
* \param prefix Prefix to look for; an empty prefix always matches
* \return true if the first prefix.size() characters of str equal prefix
*/
inline static bool StartsWith(const std::string& str, const std::string& prefix) {
  // Compare in place: no temporary substring is built and prefix is not copied per call.
  // Position 0 is always valid, and compare() clamps the length to str.size(),
  // so a str shorter than prefix simply compares unequal.
  return str.compare(0, prefix.size(), prefix) == 0;
}
inline
static
std
::
vector
<
std
::
string
>
Split
(
const
char
*
c_str
,
char
delimiter
)
{
std
::
vector
<
std
::
string
>
ret
;
std
::
string
str
(
c_str
);
...
...
@@ -58,6 +64,21 @@ inline static std::vector<std::string> Split(const char* c_str, char delimiter)
return
ret
;
}
/*!
* \brief Split a C string at every character contained in delimiters
* \param c_str Null-terminated input string
* \param delimiters Null-terminated set of delimiter characters
* \return Fields in order of appearance; empty fields are kept, so the
*         result always holds (number of delimiters found + 1) entries
*/
inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
  const std::string text(c_str);
  std::vector<std::string> fields;
  // start of the field currently being collected
  size_t begin = 0;
  for (size_t hit = text.find_first_of(delimiters);
       hit != std::string::npos;
       hit = text.find_first_of(delimiters, begin)) {
    fields.push_back(text.substr(begin, hit - begin));
    // continue right after the delimiter that was just consumed
    begin = hit + 1;
  }
  // whatever follows the last delimiter (possibly empty) is the final field
  fields.push_back(text.substr(begin));
  return fields;
}
inline
static
const
char
*
Atoi
(
const
char
*
p
,
int
*
out
)
{
int
sign
,
value
;
while
(
*
p
==
' '
)
{
...
...
include/LightGBM/utils/pipeline_reader.h
View file @
b23a2c31
...
...
@@ -21,7 +21,7 @@ public:
* \param filename Filename of data
* \process_fun Process function
*/
static
size_t
Read
(
const
char
*
filename
,
const
std
::
function
<
size_t
(
const
char
*
,
size_t
)
>&
process_fun
)
{
static
size_t
Read
(
const
char
*
filename
,
int
skip_bytes
,
const
std
::
function
<
size_t
(
const
char
*
,
size_t
)
>&
process_fun
)
{
FILE
*
file
;
#ifdef _MSC_VER
...
...
@@ -38,8 +38,13 @@ public:
char
*
buffer_process
=
new
char
[
buffer_size
];
// buffer used for the file reading
char
*
buffer_read
=
new
char
[
buffer_size
];
size_t
read_cnt
=
0
;
if
(
skip_bytes
>
0
)
{
// skip first k bytes
read_cnt
=
fread
(
buffer_process
,
1
,
skip_bytes
,
file
);
}
// read first block
size_t
read_cnt
=
fread
(
buffer_process
,
1
,
buffer_size
,
file
);
read_cnt
=
fread
(
buffer_process
,
1
,
buffer_size
,
file
);
size_t
last_read_cnt
=
0
;
while
(
read_cnt
>
0
)
{
// start read thread
...
...
include/LightGBM/utils/text_reader.h
View file @
b23a2c31
...
...
@@ -6,6 +6,7 @@
#include <LightGBM/utils/random.h>
#include <cstdio>
#include <sstream>
#include <vector>
#include <string>
...
...
@@ -22,9 +23,41 @@ public:
/*!
* \brief Constructor
* \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/
TextReader
(
const
char
*
filename
)
:
filename_
(
filename
){
TextReader
(
const
char
*
filename
,
bool
is_skip_first_line
)
:
filename_
(
filename
),
is_skip_first_line_
(
is_skip_first_line
){
if
(
is_skip_first_line_
)
{
FILE
*
file
;
#ifdef _MSC_VER
fopen_s
(
&
file
,
filename
,
"r"
);
#else
file
=
fopen
(
filename
,
"r"
);
#endif
std
::
stringstream
str_buf
;
int
read_c
=
-
1
;
read_c
=
fgetc
(
file
);
while
(
read_c
!=
EOF
)
{
char
tmp_ch
=
static_cast
<
char
>
(
read_c
);
if
(
tmp_ch
==
'\n'
||
tmp_ch
==
'\r'
)
{
break
;
}
str_buf
<<
tmp_ch
;
++
skip_bytes_
;
read_c
=
fgetc
(
file
);
}
if
(
static_cast
<
char
>
(
read_c
)
==
'\r'
)
{
read_c
=
fgetc
(
file
);
++
skip_bytes_
;
}
if
(
static_cast
<
char
>
(
read_c
)
==
'\n'
)
{
read_c
=
fgetc
(
file
);
++
skip_bytes_
;
}
fclose
(
file
);
first_line_
=
str_buf
.
str
();
Log
::
Info
(
"skip header:
\"
%s
\"
in file %s"
,
first_line_
.
c_str
(),
filename_
);
}
}
/*!
* \brief Destructor
...
...
@@ -40,6 +73,12 @@ public:
lines_
.
shrink_to_fit
();
}
/*!
* \brief return first line of data
*/
inline
std
::
string
first_line
()
{
return
first_line_
;
}
/*!
* \brief Get text data that read from file
* \return Text data, store in std::vector by line
*/
...
...
@@ -48,7 +87,7 @@ public:
INDEX_T
ReadAllAndProcess
(
const
std
::
function
<
void
(
INDEX_T
,
const
char
*
,
size_t
)
>&
process_fun
)
{
last_line_
=
""
;
INDEX_T
total_cnt
=
0
;
PipelineReader
::
Read
(
filename_
,
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
[
this
,
&
total_cnt
,
&
process_fun
]
(
const
char
*
buffer_process
,
size_t
read_cnt
)
{
size_t
cnt
=
0
;
...
...
@@ -176,7 +215,7 @@ public:
last_line_
=
""
;
INDEX_T
total_cnt
=
0
;
INDEX_T
used_cnt
=
0
;
PipelineReader
::
Read
(
filename_
,
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
[
this
,
&
total_cnt
,
&
process_fun
,
&
used_cnt
,
&
filter_fun
]
(
const
char
*
buffer_process
,
size_t
read_cnt
)
{
size_t
cnt
=
0
;
...
...
@@ -260,6 +299,12 @@ private:
std
::
vector
<
std
::
string
>
lines_
;
/*! \brief Buffer for last line */
std
::
string
last_line_
;
/*! \brief first line */
std
::
string
first_line_
=
""
;
/*! \brief is skip first line */
bool
is_skip_first_line_
=
false
;
/*! \brief Number of leading bytes (header line and its line ending) to skip when reading */
int
skip_bytes_
=
0
;
};
}
// namespace LightGBM
...
...
src/application/application.cpp
View file @
b23a2c31
...
...
@@ -76,7 +76,7 @@ void Application::LoadParameters(int argc, char** argv) {
ParameterAlias
::
KeyAliasTransform
(
&
params
);
// read parameters from config file
if
(
params
.
count
(
"config_file"
)
>
0
)
{
TextReader
<
size_t
>
config_reader
(
params
[
"config_file"
].
c_str
());
TextReader
<
size_t
>
config_reader
(
params
[
"config_file"
].
c_str
()
,
false
);
config_reader
.
ReadAllLines
();
if
(
config_reader
.
Lines
().
size
()
>
0
)
{
for
(
auto
&
line
:
config_reader
.
Lines
())
{
...
...
@@ -139,9 +139,7 @@ void Application::LoadData() {
}
train_data_
=
new
Dataset
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
input_init_score
.
c_str
(),
config_
.
io_config
.
max_bin
,
config_
.
io_config
.
data_random_seed
,
config_
.
io_config
.
is_enable_sparse
,
config_
.
io_config
,
predict_fun
);
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
...
...
@@ -173,9 +171,7 @@ void Application::LoadData() {
// add
valid_datas_
.
push_back
(
new
Dataset
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
config_
.
io_config
.
max_bin
,
config_
.
io_config
.
data_random_seed
,
config_
.
io_config
.
is_enable_sparse
,
config_
.
io_config
,
predict_fun
));
// load validation data like train data
valid_datas_
.
back
()
->
LoadValidationData
(
train_data_
,
...
...
@@ -253,7 +249,8 @@ void Application::Train() {
void
Application
::
Predict
()
{
// create predictor
Predictor
predictor
(
boosting_
,
config_
.
io_config
.
is_sigmoid
,
config_
.
predict_leaf_index
);
predictor
.
Predict
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
output_result
.
c_str
());
predictor
.
Predict
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
output_result
.
c_str
(),
config_
.
io_config
.
has_header
);
Log
::
Info
(
"Finish predict."
);
}
...
...
@@ -265,7 +262,7 @@ void Application::InitPredict() {
}
void
Application
::
LoadModel
()
{
TextReader
<
size_t
>
model_reader
(
config_
.
io_config
.
input_model
.
c_str
());
TextReader
<
size_t
>
model_reader
(
config_
.
io_config
.
input_model
.
c_str
()
,
false
);
model_reader
.
ReadAllLines
();
std
::
stringstream
ss
;
for
(
auto
&
line
:
model_reader
.
Lines
())
{
...
...
src/application/predictor.hpp
View file @
b23a2c31
...
...
@@ -92,7 +92,7 @@ public:
* \param has_header True if the data file starts with a header line
* \param result_filename Filename of output result
*/
void
Predict
(
const
char
*
data_filename
,
const
char
*
result_filename
)
{
void
Predict
(
const
char
*
data_filename
,
const
char
*
result_filename
,
bool
has_header
)
{
FILE
*
result_file
;
#ifdef _MSC_VER
...
...
@@ -104,8 +104,7 @@ public:
if
(
result_file
==
NULL
)
{
Log
::
Fatal
(
"Predition result file %s doesn't exists"
,
data_filename
);
}
bool
has_label
=
false
;
Parser
*
parser
=
Parser
::
CreateParser
(
data_filename
,
num_features_
,
&
has_label
);
Parser
*
parser
=
Parser
::
CreateParser
(
data_filename
,
has_header
,
num_features_
,
boosting_
->
LabelIdx
());
if
(
parser
==
nullptr
)
{
Log
::
Fatal
(
"Recongnizing input data format failed, filename %s"
,
data_filename
);
...
...
@@ -114,21 +113,12 @@ public:
// function for parse data
std
::
function
<
void
(
const
char
*
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
)
>
parser_fun
;
double
tmp_label
;
if
(
has_label
)
{
// parse function with label
parser_fun
=
[
this
,
&
parser
,
&
tmp_label
]
(
const
char
*
buffer
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
feature
)
{
parser
->
ParseOneLine
(
buffer
,
feature
,
&
tmp_label
);
};
Log
::
Info
(
"Start prediction for data %s with labels"
,
data_filename
);
}
else
{
// parse function without label
parser_fun
=
[
this
,
&
parser
]
(
const
char
*
buffer
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
feature
)
{
parser
->
ParseOneLine
(
buffer
,
feature
);
};
Log
::
Info
(
"Start prediction for data %s without label"
,
data_filename
);
}
parser_fun
=
[
this
,
&
parser
,
&
tmp_label
]
(
const
char
*
buffer
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
feature
)
{
parser
->
ParseOneLine
(
buffer
,
feature
,
&
tmp_label
);
};
std
::
function
<
std
::
string
(
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
)
>
predict_fun
;
if
(
predict_leaf_index
)
{
predict_fun
=
[
this
](
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
features
){
...
...
@@ -173,7 +163,7 @@ public:
fprintf
(
result_file
,
"%s
\n
"
,
pred_result
[
i
].
c_str
());
}
};
TextReader
<
data_size_t
>
predict_data_reader
(
data_filename
);
TextReader
<
data_size_t
>
predict_data_reader
(
data_filename
,
has_header
);
predict_data_reader
.
ReadAllAndProcessParallel
(
process_fun
);
fclose
(
result_file
);
...
...
src/boosting/gbdt.cpp
View file @
b23a2c31
...
...
@@ -61,7 +61,8 @@ void GBDT::Init(const Dataset* train_data, const ObjectiveFunction* object_funct
// get max feature index
max_feature_idx_
=
train_data_
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data_
->
label_idx
();
// if need bagging, create buffer
if
(
gbdt_config_
->
bagging_fraction
<
1.0
&&
gbdt_config_
->
bagging_freq
>
0
)
{
out_of_bag_data_indices_
=
new
data_size_t
[
num_data_
];
...
...
@@ -276,19 +277,21 @@ void GBDT::Boosting() {
std
::
string
GBDT
::
ModelsToString
()
const
{
// serialize this object to string
std
::
stringstream
ss
;
std
::
stringstream
str_buf
;
// output label index
str_buf
<<
"label_index="
<<
label_idx_
<<
std
::
endl
;
// output max_feature_idx
s
s
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
s
tr_buf
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
// output sigmoid parameter
s
s
<<
"sigmoid="
<<
object_function_
->
GetSigmoid
()
<<
std
::
endl
;
s
s
<<
std
::
endl
;
s
tr_buf
<<
"sigmoid="
<<
object_function_
->
GetSigmoid
()
<<
std
::
endl
;
s
tr_buf
<<
std
::
endl
;
// output tree models
for
(
size_t
i
=
0
;
i
<
models_
.
size
();
++
i
)
{
s
s
<<
"Tree="
<<
i
<<
std
::
endl
;
s
s
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
s
tr_buf
<<
"Tree="
<<
i
<<
std
::
endl
;
s
tr_buf
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
}
return
s
s
.
str
();
return
s
tr_buf
.
str
();
}
void
GBDT
::
ModelsFromString
(
const
std
::
string
&
model_str
,
int
num_used_model
)
{
...
...
@@ -296,7 +299,26 @@ void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
models_
.
clear
();
std
::
vector
<
std
::
string
>
lines
=
Common
::
Split
(
model_str
.
c_str
(),
'\n'
);
size_t
i
=
0
;
// get index of label
while
(
i
<
lines
.
size
())
{
size_t
find_pos
=
lines
[
i
].
find
(
"label_index="
);
if
(
find_pos
!=
std
::
string
::
npos
)
{
std
::
vector
<
std
::
string
>
strs
=
Common
::
Split
(
lines
[
i
].
c_str
(),
'='
);
Common
::
Atoi
(
strs
[
1
].
c_str
(),
&
label_idx_
);
++
i
;
break
;
}
else
{
++
i
;
}
}
if
(
i
==
lines
.
size
())
{
Log
::
Fatal
(
"Model file doesn't contain label index"
);
return
;
}
// get max_feature_idx first
i
=
0
;
while
(
i
<
lines
.
size
())
{
size_t
find_pos
=
lines
[
i
].
find
(
"max_feature_idx="
);
if
(
find_pos
!=
std
::
string
::
npos
)
{
...
...
src/boosting/gbdt.h
View file @
b23a2c31
...
...
@@ -82,6 +82,13 @@ public:
* \return Max feature index of this model
*/
inline
int
MaxFeatureIdx
()
const
override
{
return
max_feature_idx_
;
}
/*!
* \brief Get index of label column
* \return index of label column
*/
inline
int
LabelIdx
()
const
override
{
return
label_idx_
;
}
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
...
...
@@ -173,6 +180,9 @@ private:
* if > 0 meas output score will transform by sigmoid function
*/
double
sigmoid_
;
/*! \brief Index of label column */
data_size_t
label_idx_
;
};
}
// namespace LightGBM
...
...
src/io/config.cpp
View file @
b23a2c31
...
...
@@ -163,6 +163,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
if
(
GetString
(
params
,
"valid_data"
,
&
tmp_str
))
{
valid_data_filenames
=
Common
::
Split
(
tmp_str
.
c_str
(),
','
);
}
GetBool
(
params
,
"has_header"
,
&
has_header
);
GetString
(
params
,
"label_column"
,
&
label_column
);
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
}
...
...
src/io/dataset.cpp
View file @
b23a2c31
...
...
@@ -11,13 +11,14 @@
#include <vector>
#include <utility>
#include <string>
#include <sstream>
namespace
LightGBM
{
Dataset
::
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
int
max_bin
,
int
random_seed
,
bool
is_enable_sparse
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
data_filename
),
random_
(
random_seed
),
max_bin_
(
max_bin
),
is_enable_sparse_
(
is_enable_sparse
),
predict_fun_
(
predict_fun
)
{
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
data_filename
),
random_
(
io_config
.
data_
random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
)
{
CheckCanLoadFromBin
();
if
(
is_loading_from_binfile_
&&
predict_fun
!=
nullptr
)
{
...
...
@@ -28,13 +29,134 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
if
(
!
is_loading_from_binfile_
)
{
// load weight, query information and initilize score
metadata_
.
Init
(
data_filename
,
init_score_filename
);
// create text reader
text_reader_
=
new
TextReader
<
data_size_t
>
(
data_filename
,
io_config
.
has_header
);
std
::
unordered_map
<
std
::
string
,
int
>
name2idx
;
// get column names
if
(
io_config
.
has_header
)
{
std
::
string
first_line
=
text_reader_
->
first_line
();
feature_names_
=
Common
::
Split
(
first_line
.
c_str
(),
"
\t
,"
);
for
(
size_t
i
=
0
;
i
<
feature_names_
.
size
();
++
i
)
{
name2idx
[
feature_names_
[
i
]]
=
static_cast
<
int
>
(
i
);
}
}
std
::
string
name_prefix
(
"name:"
);
// load label idx
if
(
io_config
.
label_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
label_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
label_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
label_idx_
=
name2idx
[
name
];
Log
::
Info
(
"use %s column as label"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"cannot find label column: %s in data file"
,
name
.
c_str
());
}
}
else
{
size_t
pos
=
0
;
label_idx_
=
std
::
stoi
(
io_config
.
label_column
,
&
pos
);
if
(
pos
!=
io_config
.
label_column
.
size
())
{
Log
::
Fatal
(
"label_column is not a number, \
if you want to use column name, \
please add prefix
\"
name:
\"
before column name"
);
}
Log
::
Info
(
"use %d-th column as label"
,
label_idx_
);
}
}
if
(
feature_names_
.
size
()
>
0
)
{
// erase label column name
feature_names_
.
erase
(
feature_names_
.
begin
()
+
label_idx_
);
}
// load ignore columns
if
(
io_config
.
ignore_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
ignore_column
,
name_prefix
))
{
std
::
string
names
=
io_config
.
ignore_column
.
substr
(
name_prefix
.
size
());
for
(
auto
name
:
Common
::
Split
(
names
.
c_str
(),
','
))
{
if
(
name2idx
.
count
(
name
)
>
0
)
{
int
tmp
=
name2idx
[
name
];
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
else
{
Log
::
Fatal
(
"cannot find column: %s in data file"
,
name
.
c_str
());
}
}
}
else
{
for
(
auto
token
:
Common
::
Split
(
io_config
.
ignore_column
.
c_str
(),
','
))
{
size_t
pos
=
0
;
int
tmp
=
std
::
stoi
(
token
,
&
pos
);
if
(
pos
!=
token
.
size
())
{
Log
::
Fatal
(
"ignore_column is not a number, \
if you want to use column name, \
please add prefix
\"
name:
\"
before column name"
);
}
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
}
}
// load weight idx
if
(
io_config
.
weight_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
weight_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
weight_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
weight_idx_
=
name2idx
[
name
];
Log
::
Info
(
"use %s column as weight"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"cannot find weight column: %s in data file"
,
name
.
c_str
());
}
}
else
{
size_t
pos
=
0
;
weight_idx_
=
std
::
stoi
(
io_config
.
weight_column
,
&
pos
);
if
(
pos
!=
io_config
.
weight_column
.
size
())
{
Log
::
Fatal
(
"weight_column is not a number, \
if you want to use column name, \
please add prefix
\"
name:
\"
before column name"
);
}
Log
::
Info
(
"use %d-th column as weight"
,
weight_idx_
);
}
// skip for label column
if
(
weight_idx_
>
label_idx_
)
{
weight_idx_
-=
1
;
}
ignore_features_
.
emplace
(
weight_idx_
);
}
if
(
io_config
.
group_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
group_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
group_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
group_idx_
=
name2idx
[
name
];
Log
::
Info
(
"use %s column as group/query id"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"cannot find group/query column: %s in data file"
,
name
.
c_str
());
}
}
else
{
size_t
pos
=
0
;
group_idx_
=
std
::
stoi
(
io_config
.
group_column
,
&
pos
);
if
(
pos
!=
io_config
.
group_column
.
size
())
{
Log
::
Fatal
(
"group_column is not a number, \
if you want to use column name, \
please add prefix
\"
name:
\"
before column name"
);
}
Log
::
Info
(
"use %d-th column as group/query id"
,
group_idx_
);
}
// skip for label column
if
(
group_idx_
>
label_idx_
)
{
group_idx_
-=
1
;
}
ignore_features_
.
emplace
(
group_idx_
);
}
// create text parser
parser_
=
Parser
::
CreateParser
(
data_filename_
,
0
,
nullptr
);
parser_
=
Parser
::
CreateParser
(
data_filename_
,
io_config
.
has_header
,
0
,
label_idx_
);
if
(
parser_
==
nullptr
)
{
Log
::
Fatal
(
"Cannot recognising input data format, filename: %s"
,
data_filename_
);
}
// create text reader
text_reader_
=
new
TextReader
<
data_size_t
>
(
data_filename
);
}
else
{
// only need to load initilize score, other meta data will be loaded from bin flie
metadata_
.
Init
(
init_score_filename
);
...
...
@@ -190,18 +312,40 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
// -1 means doesn't use this feature
used_feature_map_
=
std
::
vector
<
int
>
(
sample_values
.
size
(),
-
1
);
num_total_features_
=
static_cast
<
int
>
(
sample_values
.
size
());
// check the range of label_idx, weight_idx and group_idx
CHECK
(
label_idx_
>=
0
&&
label_idx_
<=
num_total_features_
);
CHECK
(
weight_idx_
<
0
||
weight_idx_
<
num_total_features_
);
CHECK
(
group_idx_
<
0
||
group_idx_
<
num_total_features_
);
// fill feature_names_ if not header
if
(
feature_names_
.
size
()
<=
0
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
std
::
stringstream
str_buf
;
str_buf
<<
"Column_"
<<
i
;
feature_names_
.
push_back
(
str_buf
.
str
());
}
}
// start find bins
if
(
num_machines
==
1
)
{
std
::
vector
<
BinMapper
*>
bin_mappers
(
sample_values
.
size
());
// if only 1 machines, find bin locally
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
max_bin_
);
}
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
if
(
bin_mappers
[
i
]
==
nullptr
)
{
Log
::
Error
(
"Ignore Feature %s "
,
feature_names_
[
i
].
c_str
());
}
else
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
// map real feature index to used feature index
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
// push new feature
...
...
@@ -209,7 +353,7 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
num_data_
,
is_enable_sparse_
));
}
else
{
// if feature is trival(only 1 bin), free spaces
Log
::
Error
(
"Feature %
d
only contains one value, will be ignored"
,
i
);
Log
::
Error
(
"Feature %
s
only contains one value, will be ignored"
,
feature_names_
[
i
].
c_str
()
);
delete
bin_mappers
[
i
];
}
}
...
...
@@ -256,12 +400,17 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
Network
::
Allgather
(
input_buffer
,
buffer_size
,
start
,
len
,
output_buffer
);
// restore features bins from buffer
for
(
int
i
=
0
;
i
<
total_num_feature
;
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
Log
::
Error
(
"Ignore Feature %s "
,
feature_names_
[
i
].
c_str
());
continue
;
}
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
CopyFrom
(
output_buffer
+
i
*
type_size
);
if
(
!
bin_mapper
->
is_trival
())
{
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mapper
,
num_data_
,
is_enable_sparse_
));
}
else
{
Log
::
Error
(
"Feature %s only contains one value, will be ignored"
,
feature_names_
[
i
].
c_str
());
delete
bin_mapper
;
}
}
...
...
@@ -276,6 +425,13 @@ void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<
void
Dataset
::
LoadTrainData
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
bool
use_two_round_loading
)
{
// don't support query id in data file when training in parallel
if
(
num_machines
>
1
&&
!
is_pre_partition
)
{
if
(
group_idx_
>
0
)
{
Log
::
Fatal
(
"Don't support query id in data file when training parallel without pre-partition. \
Please use an additional query file or pre-partition your data"
);
}
}
used_data_indices_
.
clear
();
if
(
!
is_loading_from_binfile_
)
{
if
(
!
use_two_round_loading
)
{
...
...
@@ -287,7 +443,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers
(
rank
,
num_machines
,
sample_data
);
// initialize label
metadata_
.
Init
Label
(
num_data_
);
metadata_
.
Init
(
num_data_
,
weight_idx_
,
group_idx_
);
// extract features
ExtractFeaturesFromMemory
();
}
else
{
...
...
@@ -297,7 +453,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers
(
rank
,
num_machines
,
sample_data
);
// initialize label
metadata_
.
Init
Label
(
num_data_
);
metadata_
.
Init
(
num_data_
,
weight_idx_
,
group_idx_
);
// extract features
ExtractFeaturesFromFile
();
...
...
@@ -322,7 +478,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// read data in memory
LoadDataToMemory
(
0
,
1
,
false
);
// initialize label
metadata_
.
Init
Label
(
num_data_
);
metadata_
.
Init
(
num_data_
,
weight_idx_
,
group_idx_
);
features_
.
clear
();
// copy feature bin mapper data
for
(
Feature
*
feature
:
train_set
->
features_
)
{
...
...
@@ -336,7 +492,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// Get number of lines of data file
num_data_
=
static_cast
<
data_size_t
>
(
text_reader_
->
CountLine
());
// initialize label
metadata_
.
Init
Label
(
num_data_
);
metadata_
.
Init
(
num_data_
,
weight_idx_
,
group_idx_
);
features_
.
clear
();
// copy feature bin mapper data
for
(
Feature
*
feature
:
train_set
->
features_
)
{
...
...
@@ -381,6 +537,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
metadata_
.
SetWeightAt
(
i
,
inner_data
.
second
);
}
else
if
(
inner_data
.
first
==
group_idx_
)
{
metadata_
.
SetQueryAt
(
i
,
inner_data
.
second
);
}
}
}
}
}
else
{
...
...
@@ -407,6 +570,13 @@ void Dataset::ExtractFeaturesFromMemory() {
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
metadata_
.
SetWeightAt
(
i
,
inner_data
.
second
);
}
else
if
(
inner_data
.
first
==
group_idx_
)
{
metadata_
.
SetQueryAt
(
i
,
inner_data
.
second
);
}
}
}
}
// metadata_ will manage space of init_score
...
...
@@ -451,6 +621,13 @@ void Dataset::ExtractFeaturesFromFile() {
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
start_idx
+
i
,
inner_data
.
second
);
}
else
{
if
(
inner_data
.
first
==
weight_idx_
)
{
metadata_
.
SetWeightAt
(
start_idx
+
i
,
inner_data
.
second
);
}
else
if
(
inner_data
.
first
==
group_idx_
)
{
metadata_
.
SetQueryAt
(
start_idx
+
i
,
inner_data
.
second
);
}
}
}
}
};
...
...
src/io/metadata.cpp
View file @
b23a2c31
...
...
@@ -10,7 +10,7 @@ namespace LightGBM {
Metadata
::
Metadata
()
:
label_
(
nullptr
),
label_int_
(
nullptr
),
weights_
(
nullptr
),
query_boundaries_
(
nullptr
),
query_weights_
(
nullptr
),
init_score_
(
nullptr
)
{
query_weights_
(
nullptr
),
init_score_
(
nullptr
)
,
queries_
(
nullptr
)
{
}
...
...
@@ -36,12 +36,31 @@ Metadata::~Metadata() {
if
(
query_boundaries_
!=
nullptr
)
{
delete
[]
query_boundaries_
;
}
if
(
query_weights_
!=
nullptr
)
{
delete
[]
query_weights_
;
}
if
(
init_score_
!=
nullptr
)
{
delete
[]
init_score_
;
}
if
(
queries_
!=
nullptr
)
{
delete
[]
queries_
;
}
}
void
Metadata
::
Init
Label
(
data_size_t
num_data
)
{
void
Metadata
::
Init
(
data_size_t
num_data
,
int
weight_idx
,
int
query_idx
)
{
num_data_
=
num_data
;
label_
=
new
float
[
num_data_
];
if
(
weight_idx
>=
0
)
{
if
(
weights_
!=
nullptr
)
{
Log
::
Info
(
"using weight in data file, and ignore additional weight file"
);
delete
[]
weights_
;
}
weights_
=
new
float
[
num_data_
];
num_weights_
=
num_data_
;
memset
(
weights_
,
0
,
sizeof
(
float
)
*
num_data_
);
}
if
(
query_idx
>=
0
)
{
if
(
query_boundaries_
!=
nullptr
)
{
Log
::
Info
(
"using query id in data file, and ignore additional query file"
);
delete
[]
query_boundaries_
;
}
if
(
query_weights_
!=
nullptr
)
{
delete
[]
query_weights_
;
}
queries_
=
new
data_size_t
[
num_data_
];
memset
(
queries_
,
0
,
sizeof
(
data_size_t
)
*
num_data_
);
}
}
void
Metadata
::
PartitionLabel
(
const
std
::
vector
<
data_size_t
>&
used_indices
)
{
...
...
@@ -59,6 +78,32 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
void
Metadata
::
CheckOrPartition
(
data_size_t
num_all_data
,
const
std
::
vector
<
data_size_t
>&
used_data_indices
)
{
if
(
used_data_indices
.
size
()
==
0
)
{
if
(
queries_
!=
nullptr
)
{
// need convert query_id to boundaries
std
::
vector
<
data_size_t
>
tmp_buffer
;
data_size_t
last_qid
=
-
1
;
data_size_t
cur_cnt
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
last_qid
!=
queries_
[
i
])
{
if
(
cur_cnt
>
0
)
{
tmp_buffer
.
push_back
(
cur_cnt
);
}
cur_cnt
=
0
;
last_qid
=
queries_
[
i
];
}
++
cur_cnt
;
}
tmp_buffer
.
push_back
(
cur_cnt
);
query_boundaries_
=
new
data_size_t
[
tmp_buffer
.
size
()
+
1
];
num_queries_
=
static_cast
<
data_size_t
>
(
tmp_buffer
.
size
());
query_boundaries_
[
0
]
=
0
;
for
(
size_t
i
=
0
;
i
<
tmp_buffer
.
size
();
++
i
)
{
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
tmp_buffer
[
i
];
}
LoadQueryWeights
();
delete
[]
queries_
;
queries_
=
nullptr
;
}
// check weights
if
(
weights_
!=
nullptr
&&
num_weights_
!=
num_data_
)
{
Log
::
Error
(
"Initial weight size doesn't equal to data, weights will be ignored"
);
...
...
@@ -131,10 +176,10 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
used_query
.
push_back
(
qid
);
data_idx
+=
len
;
}
else
{
Log
::
Fatal
(
"Data partition error, data didn't match queies"
);
Log
::
Fatal
(
"Data partition error, data didn't match que
r
ies"
);
}
}
else
{
Log
::
Fatal
(
"Data partition error, data didn't match queies"
);
Log
::
Fatal
(
"Data partition error, data didn't match que
r
ies"
);
}
}
data_size_t
*
old_query_boundaries
=
query_boundaries_
;
...
...
@@ -177,7 +222,7 @@ void Metadata::LoadWeights() {
std
::
string
weight_filename
(
data_filename_
);
// default weight file name
weight_filename
.
append
(
".weight"
);
TextReader
<
size_t
>
reader
(
weight_filename
.
c_str
());
TextReader
<
size_t
>
reader
(
weight_filename
.
c_str
()
,
false
);
reader
.
ReadAllLines
();
if
(
reader
.
Lines
().
size
()
<=
0
)
{
return
;
...
...
@@ -195,7 +240,7 @@ void Metadata::LoadWeights() {
void
Metadata
::
LoadInitialScore
()
{
num_init_score_
=
0
;
if
(
init_score_filename_
[
0
]
==
'\0'
)
{
return
;
}
TextReader
<
size_t
>
reader
(
init_score_filename_
);
TextReader
<
size_t
>
reader
(
init_score_filename_
,
false
);
reader
.
ReadAllLines
();
Log
::
Info
(
"Start loading initial scores"
);
...
...
@@ -213,7 +258,7 @@ void Metadata::LoadQueryBoundaries() {
std
::
string
query_filename
(
data_filename_
);
// default query file name
query_filename
.
append
(
".query"
);
TextReader
<
size_t
>
reader
(
query_filename
.
c_str
());
TextReader
<
size_t
>
reader
(
query_filename
.
c_str
()
,
false
);
reader
.
ReadAllLines
();
if
(
reader
.
Lines
().
size
()
<=
0
)
{
return
;
...
...
src/io/parser.cpp
View file @
b23a2c31
...
...
@@ -2,6 +2,7 @@
#include <iostream>
#include <fstream>
#include <functional>
namespace
LightGBM
{
...
...
@@ -20,44 +21,65 @@ void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt)
}
}
bool
CheckHasLabelForLibsvm
(
std
::
string
&
str
)
{
int
GetLabelIdxForLibsvm
(
std
::
string
&
str
,
int
num_features
,
int
label_idx
)
{
if
(
num_features
<=
0
)
{
return
label_idx
;
}
str
=
Common
::
Trim
(
str
);
auto
pos_space
=
str
.
find_first_of
(
"
\f\n\r\t\v
"
);
auto
pos_colon
=
str
.
find_first_of
(
":"
);
if
(
pos_colon
==
std
::
string
::
npos
||
pos_colon
>
pos_space
)
{
return
true
;
return
-
1
;
}
else
{
return
false
;
return
label_idx
;
}
}
bool
CheckHasLabelForTSV
(
std
::
string
&
str
,
int
num_features
)
{
int
GetLabelIdxForTSV
(
std
::
string
&
str
,
int
num_features
,
int
label_idx
)
{
if
(
num_features
<=
0
)
{
return
label_idx
;
}
str
=
Common
::
Trim
(
str
);
auto
tokens
=
Common
::
Split
(
str
.
c_str
(),
'\t'
);
if
(
static_cast
<
int
>
(
tokens
.
size
())
==
num_features
)
{
return
false
;
return
-
1
;
}
else
{
return
true
;
return
label_idx
;
}
}
bool
CheckHasLabelForCSV
(
std
::
string
&
str
,
int
num_features
)
{
int
GetLabelIdxForCSV
(
std
::
string
&
str
,
int
num_features
,
int
label_idx
)
{
if
(
num_features
<=
0
)
{
return
label_idx
;
}
str
=
Common
::
Trim
(
str
);
auto
tokens
=
Common
::
Split
(
str
.
c_str
(),
','
);
if
(
static_cast
<
int
>
(
tokens
.
size
())
==
num_features
)
{
return
false
;
return
-
1
;
}
else
{
return
true
;
return
label_idx
;
}
}
Parser
*
Parser
::
CreateParser
(
const
char
*
filename
,
int
num_features
,
bool
*
has_label
)
{
enum
DataType
{
INVALID
,
CSV
,
TSV
,
LIBSVM
};
Parser
*
Parser
::
CreateParser
(
const
char
*
filename
,
bool
has_header
,
int
num_features
,
int
label_idx
)
{
std
::
ifstream
tmp_file
;
tmp_file
.
open
(
filename
);
if
(
!
tmp_file
.
is_open
())
{
Log
::
Fatal
(
"Data file: %s doesn't exist"
,
filename
);
}
std
::
string
line1
,
line2
;
if
(
has_header
)
{
if
(
!
tmp_file
.
eof
())
{
std
::
getline
(
tmp_file
,
line1
);
}
}
if
(
!
tmp_file
.
eof
())
{
std
::
getline
(
tmp_file
,
line1
);
}
else
{
...
...
@@ -75,44 +97,48 @@ Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_l
// Get some statistic from 2 line
GetStatistic
(
line1
.
c_str
(),
&
comma_cnt
,
&
tab_cnt
,
&
colon_cnt
);
GetStatistic
(
line2
.
c_str
(),
&
comma_cnt2
,
&
tab_cnt2
,
&
colon_cnt2
);
Parser
*
ret
=
nullptr
;
DataType
type
=
DataType
::
INVALID
;
if
(
line2
.
size
()
==
0
)
{
// if only have one line on file
if
(
colon_cnt
>
0
)
{
ret
=
new
LibSVMParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForLibsvm
(
line1
);
}
type
=
DataType
::
LIBSVM
;
}
else
if
(
tab_cnt
>
0
)
{
ret
=
new
TSVParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForTSV
(
line1
,
num_features
);
}
type
=
DataType
::
TSV
;
}
else
if
(
comma_cnt
>
0
)
{
ret
=
new
CSVParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForCSV
(
line1
,
num_features
);
}
}
type
=
DataType
::
CSV
;
}
}
else
{
if
(
colon_cnt
>
0
||
colon_cnt2
>
0
)
{
ret
=
new
LibSVMParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForLibsvm
(
line1
);
}
}
else
if
(
tab_cnt
==
tab_cnt2
&&
tab_cnt
>
0
)
{
ret
=
new
TSVParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForTSV
(
line1
,
num_features
);
}
type
=
DataType
::
LIBSVM
;
}
else
if
(
tab_cnt
==
tab_cnt2
&&
tab_cnt
>
0
)
{
type
=
DataType
::
TSV
;
}
else
if
(
comma_cnt
==
comma_cnt2
&&
comma_cnt
>
0
)
{
ret
=
new
CSVParser
();
if
(
num_features
>
0
&&
has_label
!=
nullptr
)
{
*
has_label
=
CheckHasLabelForCSV
(
line1
,
num_features
);
}
type
=
DataType
::
CSV
;
}
}
if
(
type
==
DataType
::
INVALID
)
{
Log
::
Fatal
(
"Unkown format of training data"
);
}
Parser
*
ret
=
nullptr
;
if
(
type
==
DataType
::
LIBSVM
)
{
label_idx
=
GetLabelIdxForLibsvm
(
line1
,
num_features
,
label_idx
);
ret
=
new
LibSVMParser
(
label_idx
);
}
else
if
(
type
==
DataType
::
TSV
)
{
label_idx
=
GetLabelIdxForTSV
(
line1
,
num_features
,
label_idx
);
ret
=
new
TSVParser
(
label_idx
);
}
else
if
(
type
==
DataType
::
CSV
)
{
label_idx
=
GetLabelIdxForCSV
(
line1
,
num_features
,
label_idx
);
ret
=
new
CSVParser
(
label_idx
);
}
if
(
label_idx
<
0
)
{
Log
::
Info
(
"Data file: %s doesn't contain label column"
,
filename
);
}
return
ret
;
}
...
...
src/io/parser.hpp
View file @
b23a2c31
...
...
@@ -14,14 +14,23 @@ namespace LightGBM {
class
CSVParser
:
public
Parser
{
public:
explicit
CSVParser
(
int
label_idx
)
:
label_idx_
(
label_idx
)
{
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
)
const
override
{
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
int
idx
=
0
;
double
val
=
0.0
;
int
bias
=
0
;
*
out_label
=
0.0
f
;
while
(
*
str
!=
'\0'
)
{
str
=
Common
::
Atof
(
str
,
&
val
);
if
(
fabs
(
val
)
>
1e-10
)
{
out_features
->
emplace_back
(
idx
,
val
);
if
(
idx
==
label_idx_
)
{
*
out_label
=
val
;
bias
=
-
1
;
}
else
if
(
fabs
(
val
)
>
1e-10
)
{
out_features
->
emplace_back
(
idx
+
bias
,
val
);
}
++
idx
;
if
(
*
str
==
','
)
{
...
...
@@ -31,28 +40,27 @@ public:
}
}
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
// first column is label
str
=
Common
::
Atof
(
str
,
out_label
);
if
(
*
str
==
','
)
{
++
str
;
}
else
if
(
*
str
!=
'\0'
)
{
Log
::
Fatal
(
"input format error, should be CSV"
);
}
return
ParseOneLine
(
str
,
out_features
);
}
private:
int
label_idx_
=
0
;
};
class
TSVParser
:
public
Parser
{
public:
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
)
const
override
{
explicit
TSVParser
(
int
label_idx
)
:
label_idx_
(
label_idx
)
{
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
int
idx
=
0
;
double
val
=
0.0
;
int
bias
=
0
;
while
(
*
str
!=
'\0'
)
{
str
=
Common
::
Atof
(
str
,
&
val
);
if
(
fabs
(
val
)
>
1e-10
)
{
out_features
->
emplace_back
(
idx
,
val
);
if
(
idx
==
label_idx_
)
{
*
out_label
=
val
;
bias
=
-
1
;
}
else
if
(
fabs
(
val
)
>
1e-10
)
{
out_features
->
emplace_back
(
idx
+
bias
,
val
);
}
++
idx
;
if
(
*
str
==
'\t'
)
{
...
...
@@ -62,24 +70,27 @@ public:
}
}
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
// first column is label
str
=
Common
::
Atof
(
str
,
out_label
);
if
(
*
str
==
'\t'
)
{
++
str
;
}
else
if
(
*
str
!=
'\0'
)
{
Log
::
Fatal
(
"input format error, should be TSV"
);
}
return
ParseOneLine
(
str
,
out_features
);
}
private:
int
label_idx_
=
0
;
};
class
LibSVMParser
:
public
Parser
{
public:
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
)
const
override
{
explicit
LibSVMParser
(
int
label_idx
)
:
label_idx_
(
label_idx
)
{
if
(
label_idx
>
0
)
{
Log
::
Fatal
(
"label should be the first column in Libsvm file"
);
}
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
int
idx
=
0
;
double
val
=
0.0
;
if
(
label_idx_
==
0
)
{
str
=
Common
::
Atof
(
str
,
&
val
);
*
out_label
=
val
;
str
=
Common
::
SkipSpaceAndTab
(
str
);
}
while
(
*
str
!=
'\0'
)
{
str
=
Common
::
Atoi
(
str
,
&
idx
);
str
=
Common
::
SkipSpaceAndTab
(
str
);
...
...
@@ -93,13 +104,9 @@ public:
str
=
Common
::
SkipSpaceAndTab
(
str
);
}
}
inline
void
ParseOneLine
(
const
char
*
str
,
std
::
vector
<
std
::
pair
<
int
,
double
>>*
out_features
,
double
*
out_label
)
const
override
{
// first column is label
str
=
Common
::
Atof
(
str
,
out_label
);
str
=
Common
::
SkipSpaceAndTab
(
str
);
return
ParseOneLine
(
str
,
out_features
);
}
private:
int
label_idx_
=
0
;
};
}
// namespace LightGBM
#endif // LightGBM_IO_PARSER_HPP_
src/network/linkers_socket.cpp
View file @
b23a2c31
...
...
@@ -77,7 +77,7 @@ Linkers::~Linkers() {
}
void
Linkers
::
ParseMachineList
(
const
char
*
filename
)
{
TextReader
<
size_t
>
machine_list_reader
(
filename
);
TextReader
<
size_t
>
machine_list_reader
(
filename
,
false
);
machine_list_reader
.
ReadAllLines
();
if
(
machine_list_reader
.
Lines
().
size
()
<=
0
)
{
Log
::
Fatal
(
"Machine list file:%s doesn't exist"
,
filename
);
...
...
src/objective/rank_objective.hpp
View file @
b23a2c31
...
...
@@ -47,7 +47,7 @@ public:
// get boundries
query_boundaries_
=
metadata
.
query_boundaries
();
if
(
query_boundaries_
==
nullptr
)
{
Log
::
Fatal
(
"For
NDCG metric
, should have query information"
);
Log
::
Fatal
(
"For
lambdarank tasks
, should have query information"
);
}
num_queries_
=
metadata
.
num_queries
();
// cache inverse max DCG, avoid computation many times
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment