Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
33344088
Commit
33344088
authored
Nov 07, 2016
by
Guolin Ke
Browse files
clean code
parent
56f6f30f
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
39 additions
and
38 deletions
+39
-38
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+4
-6
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+2
-3
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+3
-3
src/application/application.cpp
src/application/application.cpp
+2
-2
src/c_api.cpp
src/c_api.cpp
+11
-17
src/io/dataset.cpp
src/io/dataset.cpp
+10
-2
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+7
-5
No files found.
include/LightGBM/dataset.h
View file @
33344088
...
@@ -246,6 +246,8 @@ public:
...
@@ -246,6 +246,8 @@ public:
Dataset
();
Dataset
();
explicit
Dataset
(
data_size_t
num_data
);
/*! \brief Destructor */
/*! \brief Destructor */
~
Dataset
();
~
Dataset
();
...
@@ -267,7 +269,7 @@ public:
...
@@ -267,7 +269,7 @@ public:
}
}
}
}
inline
void
PushOneCol
(
int
tid
,
data_size_t
col_idx
,
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
feature_values
)
{
inline
void
PushOneCol
umn
(
int
tid
,
data_size_t
col_idx
,
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
feature_values
)
{
int
feature_idx
=
used_feature_map_
[
col_idx
];
int
feature_idx
=
used_feature_map_
[
col_idx
];
if
(
feature_idx
>=
0
)
{
if
(
feature_idx
>=
0
)
{
for
(
auto
&
inner_data
:
feature_values
)
{
for
(
auto
&
inner_data
:
feature_values
)
{
...
@@ -276,10 +278,6 @@ public:
...
@@ -276,10 +278,6 @@ public:
}
}
}
}
inline
void
SetNumData
(
data_size_t
num_data
)
{
num_data_
=
num_data
;
}
void
FinishLoad
();
void
FinishLoad
();
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
...
@@ -293,7 +291,7 @@ public:
...
@@ -293,7 +291,7 @@ public:
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
void
CopyFeature
Metadata
To
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
;
void
CopyFeature
BinMapper
To
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
;
/*!
/*!
* \brief Get a feature pointer for specific index
* \brief Get a feature pointer for specific index
...
...
include/LightGBM/dataset_loader.h
View file @
33344088
...
@@ -12,7 +12,7 @@ public:
...
@@ -12,7 +12,7 @@ public:
~
DatasetLoader
();
~
DatasetLoader
();
void
SetHead
d
er
(
const
char
*
filename
);
void
SetHeader
(
const
char
*
filename
);
Dataset
*
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
);
Dataset
*
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
);
...
@@ -20,7 +20,7 @@ public:
...
@@ -20,7 +20,7 @@ public:
return
LoadFromFile
(
filename
,
0
,
1
);
return
LoadFromFile
(
filename
,
0
,
1
);
}
}
Dataset
*
LoadFromFile
LikeOthers
(
const
char
*
filename
,
const
Dataset
*
other
);
Dataset
*
LoadFromFile
AlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
);
Dataset
*
LoadFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
);
Dataset
*
LoadFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
);
...
@@ -51,7 +51,6 @@ private:
...
@@ -51,7 +51,6 @@ private:
/*! \brief Check can load from binary file */
/*! \brief Check can load from binary file */
bool
CheckCanLoadFromBin
(
const
char
*
filename
);
bool
CheckCanLoadFromBin
(
const
char
*
filename
);
const
IOConfig
&
io_config_
;
const
IOConfig
&
io_config_
;
/*! \brief Random generator*/
/*! \brief Random generator*/
Random
random_
;
Random
random_
;
...
...
include/LightGBM/utils/common.h
View file @
33344088
...
@@ -382,7 +382,7 @@ inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t s
...
@@ -382,7 +382,7 @@ inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t s
}
}
inline
std
::
function
<
std
::
vector
<
double
>
(
int
row_idx
)
>
inline
std
::
function
<
std
::
vector
<
double
>
(
int
row_idx
)
>
Get
RowFunctionFrom
Mat
(
const
void
*
data
,
int
num_row
,
int
num_col
,
int
float_type
,
int
is_row_major
)
{
RowFunctionFrom
DenseMatric
(
const
void
*
data
,
int
num_row
,
int
num_col
,
int
float_type
,
int
is_row_major
)
{
if
(
float_type
==
0
)
{
if
(
float_type
==
0
)
{
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
if
(
is_row_major
)
{
if
(
is_row_major
)
{
...
@@ -432,7 +432,7 @@ GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type
...
@@ -432,7 +432,7 @@ GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type
inline
std
::
function
<
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
int
idx
)
>
inline
std
::
function
<
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
int
idx
)
>
Get
RowFunctionFromCSR
(
const
int32_t
*
indptr
,
const
int32_t
*
indices
,
const
void
*
data
,
int
float_type
,
uint64_t
nindptr
,
uint64_t
nelem
)
{
RowFunctionFromCSR
(
const
int32_t
*
indptr
,
const
int32_t
*
indices
,
const
void
*
data
,
int
float_type
,
uint64_t
nindptr
,
uint64_t
nelem
)
{
if
(
float_type
==
0
)
{
if
(
float_type
==
0
)
{
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
return
[
&
indptr
,
&
indices
,
&
dptr
,
&
nindptr
,
&
nelem
](
int
idx
)
{
return
[
&
indptr
,
&
indices
,
&
dptr
,
&
nindptr
,
&
nelem
](
int
idx
)
{
...
@@ -463,7 +463,7 @@ GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void*
...
@@ -463,7 +463,7 @@ GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void*
}
}
inline
std
::
function
<
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
int
idx
)
>
inline
std
::
function
<
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
int
idx
)
>
Get
ColFunctionFromCSC
(
const
int32_t
*
col_ptr
,
const
int32_t
*
indices
,
const
void
*
data
,
int
float_type
,
uint64_t
ncol_ptr
,
uint64_t
nelem
)
{
Col
umn
FunctionFromCSC
(
const
int32_t
*
col_ptr
,
const
int32_t
*
indices
,
const
void
*
data
,
int
float_type
,
uint64_t
ncol_ptr
,
uint64_t
nelem
)
{
if
(
float_type
==
0
)
{
if
(
float_type
==
0
)
{
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
const
float
*
dptr
=
reinterpret_cast
<
const
float
*>
(
data
);
return
[
&
col_ptr
,
&
indices
,
&
dptr
,
&
ncol_ptr
,
&
nelem
](
int
idx
)
{
return
[
&
col_ptr
,
&
indices
,
&
dptr
,
&
ncol_ptr
,
&
nelem
](
int
idx
)
{
...
...
src/application/application.cpp
View file @
33344088
...
@@ -145,7 +145,7 @@ void Application::LoadData() {
...
@@ -145,7 +145,7 @@ void Application::LoadData() {
}
}
dataset_loader_
=
new
DatasetLoader
(
config_
.
io_config
,
predict_fun
);
dataset_loader_
=
new
DatasetLoader
(
config_
.
io_config
,
predict_fun
);
dataset_loader_
->
SetHead
d
er
(
config_
.
io_config
.
data_filename
.
c_str
());
dataset_loader_
->
SetHeader
(
config_
.
io_config
.
data_filename
.
c_str
());
// load Training data
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
if
(
config_
.
is_parallel_find_bin
)
{
// load data for parallel training
// load data for parallel training
...
@@ -173,7 +173,7 @@ void Application::LoadData() {
...
@@ -173,7 +173,7 @@ void Application::LoadData() {
// Add validation data, if it exists
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
// add
// add
valid_datas_
.
push_back
(
dataset_loader_
->
LoadFromFile
LikeOthers
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
valid_datas_
.
push_back
(
dataset_loader_
->
LoadFromFile
AlignWithOtherDataset
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
train_data_
));
train_data_
));
// need save binary file
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
if
(
config_
.
io_config
.
is_save_binary_file
)
{
...
...
src/c_api.cpp
View file @
33344088
...
@@ -126,7 +126,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
...
@@ -126,7 +126,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
if
(
reference
==
nullptr
)
{
if
(
reference
==
nullptr
)
{
*
out
=
loader
.
LoadFromFile
(
filename
);
*
out
=
loader
.
LoadFromFile
(
filename
);
}
else
{
}
else
{
*
out
=
loader
.
LoadFromFile
LikeOthers
(
filename
,
reinterpret_cast
<
const
Dataset
*>
(
*
reference
));
*
out
=
loader
.
LoadFromFile
AlignWithOtherDataset
(
filename
,
reinterpret_cast
<
const
Dataset
*>
(
*
reference
));
}
}
return
0
;
return
0
;
}
}
...
@@ -154,7 +154,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
...
@@ -154,7 +154,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
config
.
LoadFromString
(
parameters
);
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
Dataset
*
ret
=
nullptr
;
Dataset
*
ret
=
nullptr
;
auto
get_row_fun
=
Common
::
Get
RowFunctionFrom
Mat
(
data
,
nrow
,
ncol
,
float_type
,
is_row_major
);
auto
get_row_fun
=
Common
::
RowFunctionFrom
DenseMatric
(
data
,
nrow
,
ncol
,
float_type
,
is_row_major
);
if
(
reference
==
nullptr
)
{
if
(
reference
==
nullptr
)
{
// sample data first
// sample data first
Random
rand
(
config
.
io_config
.
data_random_seed
);
Random
rand
(
config
.
io_config
.
data_random_seed
);
...
@@ -170,10 +170,8 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
...
@@ -170,10 +170,8 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
}
}
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
();
ret
=
new
Dataset
(
nrow
);
// need to set num_data first
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureBinMapperTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
ret
->
SetNumData
(
nrow
);
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureMetadataTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
}
}
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
...
@@ -202,7 +200,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
...
@@ -202,7 +200,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
config
.
LoadFromString
(
parameters
);
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
Dataset
*
ret
=
nullptr
;
Dataset
*
ret
=
nullptr
;
auto
get_row_fun
=
Common
::
Get
RowFunctionFromCSR
(
indptr
,
indices
,
data
,
float_type
,
nindptr
,
nelem
);
auto
get_row_fun
=
Common
::
RowFunctionFromCSR
(
indptr
,
indices
,
data
,
float_type
,
nindptr
,
nelem
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
nindptr
-
1
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
nindptr
-
1
);
if
(
reference
==
nullptr
)
{
if
(
reference
==
nullptr
)
{
// sample data first
// sample data first
...
@@ -233,10 +231,8 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
...
@@ -233,10 +231,8 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
CHECK
(
num_col
>=
sample_values
.
size
());
CHECK
(
num_col
>=
sample_values
.
size
());
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
();
ret
=
new
Dataset
(
nrow
);
// need to set num_data first
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureBinMapperTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
ret
->
SetNumData
(
nrow
);
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureMetadataTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
}
}
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
...
@@ -266,7 +262,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
...
@@ -266,7 +262,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
config
.
LoadFromString
(
parameters
);
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
Dataset
*
ret
=
nullptr
;
Dataset
*
ret
=
nullptr
;
auto
get_col_fun
=
Common
::
Get
ColFunctionFromCSC
(
col_ptr
,
indices
,
data
,
float_type
,
ncol_ptr
,
nelem
);
auto
get_col_fun
=
Common
::
Col
umn
FunctionFromCSC
(
col_ptr
,
indices
,
data
,
float_type
,
ncol_ptr
,
nelem
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
num_row
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
num_row
);
if
(
reference
==
nullptr
)
{
if
(
reference
==
nullptr
)
{
Log
::
Warning
(
"Construct from CSC format is not efficient"
);
Log
::
Warning
(
"Construct from CSC format is not efficient"
);
...
@@ -282,17 +278,15 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
...
@@ -282,17 +278,15 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
}
}
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
();
ret
=
new
Dataset
(
nrow
);
// need to set num_data first
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureBinMapperTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
ret
->
SetNumData
(
nrow
);
reinterpret_cast
<
const
Dataset
*>
(
*
reference
)
->
CopyFeatureMetadataTo
(
ret
,
config
.
io_config
.
is_enable_sparse
);
}
}
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
ncol_ptr
-
1
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
const
int
tid
=
omp_get_thread_num
();
auto
one_col
=
get_col_fun
(
i
);
auto
one_col
=
get_col_fun
(
i
);
ret
->
PushOneCol
(
tid
,
i
,
one_col
);
ret
->
PushOneCol
umn
(
tid
,
i
,
one_col
);
}
}
ret
->
FinishLoad
();
ret
->
FinishLoad
();
*
out
=
ret
;
*
out
=
ret
;
...
...
src/io/dataset.cpp
View file @
33344088
...
@@ -21,6 +21,12 @@ Dataset::Dataset() {
...
@@ -21,6 +21,12 @@ Dataset::Dataset() {
is_loading_from_binfile_
=
false
;
is_loading_from_binfile_
=
false
;
}
}
Dataset
::
Dataset
(
data_size_t
num_data
)
{
num_class_
=
1
;
num_data_
=
num_data
;
is_loading_from_binfile_
=
false
;
}
Dataset
::~
Dataset
()
{
Dataset
::~
Dataset
()
{
for
(
auto
&
feature
:
features_
)
{
for
(
auto
&
feature
:
features_
)
{
delete
feature
;
delete
feature
;
...
@@ -35,13 +41,14 @@ void Dataset::FinishLoad() {
...
@@ -35,13 +41,14 @@ void Dataset::FinishLoad() {
}
}
}
}
void
Dataset
::
CopyFeature
Metadata
To
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
{
void
Dataset
::
CopyFeature
BinMapper
To
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
{
dataset
->
features_
.
clear
();
dataset
->
features_
.
clear
();
// copy feature bin mapper data
// copy feature bin mapper data
for
(
Feature
*
feature
:
features_
)
{
for
(
Feature
*
feature
:
features_
)
{
dataset
->
features_
.
push_back
(
new
Feature
(
feature
->
feature_index
(),
dataset
->
features_
.
push_back
(
new
Feature
(
feature
->
feature_index
(),
new
BinMapper
(
*
feature
->
bin_mapper
()),
dataset
->
num_data_
,
is_enable_sparse
));
new
BinMapper
(
*
feature
->
bin_mapper
()),
dataset
->
num_data_
,
is_enable_sparse
));
}
}
dataset
->
num_class_
=
num_class_
;
dataset
->
used_feature_map_
=
used_feature_map_
;
dataset
->
used_feature_map_
=
used_feature_map_
;
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
num_total_features_
=
num_total_features_
;
dataset
->
num_total_features_
=
num_total_features_
;
...
@@ -131,7 +138,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -131,7 +138,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log
::
Info
(
"Saving data to binary file %s"
,
data_filename_
);
Log
::
Info
(
"Saving data to binary file %s"
,
data_filename_
);
// get size of header
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_class_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
// size of feature names
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
...
@@ -140,6 +147,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -140,6 +147,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
fwrite
(
&
size_of_header
,
sizeof
(
size_of_header
),
1
,
file
);
fwrite
(
&
size_of_header
,
sizeof
(
size_of_header
),
1
,
file
);
// write header
// write header
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_class_
,
sizeof
(
num_class_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
size_t
num_used_feature_map
=
used_feature_map_
.
size
();
size_t
num_used_feature_map
=
used_feature_map_
.
size
();
...
...
src/io/dataset_loader.cpp
View file @
33344088
...
@@ -17,7 +17,7 @@ DatasetLoader::~DatasetLoader() {
...
@@ -17,7 +17,7 @@ DatasetLoader::~DatasetLoader() {
}
}
void
DatasetLoader
::
SetHead
d
er
(
const
char
*
filename
)
{
void
DatasetLoader
::
SetHeader
(
const
char
*
filename
)
{
TextReader
<
data_size_t
>
text_reader
(
filename
,
io_config_
.
has_header
);
TextReader
<
data_size_t
>
text_reader
(
filename
,
io_config_
.
has_header
);
std
::
unordered_map
<
std
::
string
,
int
>
name2idx
;
std
::
unordered_map
<
std
::
string
,
int
>
name2idx
;
...
@@ -200,7 +200,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
...
@@ -200,7 +200,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Dataset
*
DatasetLoader
::
LoadFromFile
LikeOthers
(
const
char
*
filename
,
const
Dataset
*
other
)
{
Dataset
*
DatasetLoader
::
LoadFromFile
AlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
)
{
auto
parser
=
Parser
::
CreateParser
(
filename
,
io_config_
.
has_header
,
0
,
label_idx_
);
auto
parser
=
Parser
::
CreateParser
(
filename
,
io_config_
.
has_header
,
0
,
label_idx_
);
if
(
parser
==
nullptr
)
{
if
(
parser
==
nullptr
)
{
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
...
@@ -219,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
...
@@ -219,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_data
.
size
());
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_data
.
size
());
// initialize label
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
other
->
CopyFeature
Metadata
To
(
dataset
,
io_config_
.
is_enable_sparse
);
train_data
->
CopyFeature
BinMapper
To
(
dataset
,
io_config_
.
is_enable_sparse
);
// extract features
// extract features
ExtractFeaturesFromMemory
(
text_data
,
parser
,
dataset
);
ExtractFeaturesFromMemory
(
text_data
,
parser
,
dataset
);
text_data
.
clear
();
text_data
.
clear
();
...
@@ -230,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
...
@@ -230,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
num_global_data
=
dataset
->
num_data_
;
num_global_data
=
dataset
->
num_data_
;
// initialize label
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
other
->
CopyFeature
Metadata
To
(
dataset
,
io_config_
.
is_enable_sparse
);
train_data
->
CopyFeature
BinMapper
To
(
dataset
,
io_config_
.
is_enable_sparse
);
// extract features
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
,
used_data_indices
,
dataset
);
ExtractFeaturesFromFile
(
filename
,
parser
,
used_data_indices
,
dataset
);
}
}
...
@@ -290,6 +290,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
...
@@ -290,6 +290,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
const
char
*
mem_ptr
=
buffer
;
const
char
*
mem_ptr
=
buffer
;
dataset
->
num_data_
=
*
(
reinterpret_cast
<
const
data_size_t
*>
(
mem_ptr
));
dataset
->
num_data_
=
*
(
reinterpret_cast
<
const
data_size_t
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_data_
);
mem_ptr
+=
sizeof
(
dataset
->
num_data_
);
dataset
->
num_class_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_class_
);
dataset
->
num_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
dataset
->
num_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_features_
);
mem_ptr
+=
sizeof
(
dataset
->
num_features_
);
dataset
->
num_total_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
dataset
->
num_total_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
...
@@ -415,7 +417,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
...
@@ -415,7 +417,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
}
}
Dataset
*
dataset
=
new
Dataset
();
Dataset
*
dataset
=
new
Dataset
();
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
features_
.
clear
();
dataset
->
features_
.
clear
();
dataset
->
num_data_
=
num_data
;
dataset
->
num_data_
=
num_data
;
// -1 means doesn't use this feature
// -1 means doesn't use this feature
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment