Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
522e9993
Commit
522e9993
authored
Nov 26, 2016
by
Guolin Ke
Browse files
support identity bin file from file content
parent
0ae51f14
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
45 additions
and
33 deletions
+45
-33
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+0
-9
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+2
-0
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+1
-1
src/c_api.cpp
src/c_api.cpp
+0
-9
src/io/dataset.cpp
src/io/dataset.cpp
+3
-1
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+39
-13
No files found.
include/LightGBM/c_api.h
View file @
522e9993
...
@@ -62,15 +62,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
...
@@ -62,15 +62,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
const
DatesetHandle
*
reference
,
const
DatesetHandle
*
reference
,
DatesetHandle
*
out
);
DatesetHandle
*
out
);
/*!
* \brief load data set from binary file like the command_line LightGBM do
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when succeed, -1 when failure happens
*/
DllExport
int
LGBM_CreateDatasetFromBinaryFile
(
const
char
*
filename
,
DatesetHandle
*
out
);
/*!
/*!
* \brief create a dataset from CSR format
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indptr pointer to row headers
...
...
include/LightGBM/dataset.h
View file @
522e9993
...
@@ -402,6 +402,8 @@ private:
...
@@ -402,6 +402,8 @@ private:
int
label_idx_
=
0
;
int
label_idx_
=
0
;
/*! \brief store feature names */
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief store feature names */
static
const
char
*
binary_file_token
;
};
};
}
// namespace LightGBM
}
// namespace LightGBM
...
...
include/LightGBM/dataset_loader.h
View file @
522e9993
...
@@ -49,7 +49,7 @@ private:
...
@@ -49,7 +49,7 @@ private:
void
ExtractFeaturesFromFile
(
const
char
*
filename
,
const
Parser
*
parser
,
const
std
::
vector
<
data_size_t
>&
used_data_indices
,
Dataset
*
dataset
);
void
ExtractFeaturesFromFile
(
const
char
*
filename
,
const
Parser
*
parser
,
const
std
::
vector
<
data_size_t
>&
used_data_indices
,
Dataset
*
dataset
);
/*! \brief Check can load from binary file */
/*! \brief Check can load from binary file */
bool
CheckCanLoadFromBin
(
const
char
*
filename
);
std
::
string
CheckCanLoadFromBin
(
const
char
*
filename
);
const
IOConfig
&
io_config_
;
const
IOConfig
&
io_config_
;
/*! \brief Random generator*/
/*! \brief Random generator*/
...
...
src/c_api.cpp
View file @
522e9993
...
@@ -223,15 +223,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
...
@@ -223,15 +223,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
API_END
();
API_END
();
}
}
DllExport
int
LGBM_CreateDatasetFromBinaryFile
(
const
char
*
filename
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
*
out
=
loader
.
LoadFromBinFile
(
filename
,
0
,
1
);
API_END
();
}
DllExport
int
LGBM_CreateDatasetFromMat
(
const
void
*
data
,
DllExport
int
LGBM_CreateDatasetFromMat
(
const
void
*
data
,
int
data_type
,
int
data_type
,
int32_t
nrow
,
int32_t
nrow
,
...
...
src/io/dataset.cpp
View file @
522e9993
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
namespace
LightGBM
{
namespace
LightGBM
{
const
char
*
Dataset
::
binary_file_token
=
"______LightGBM_Binary_File_Token______
\n
"
;
Dataset
::
Dataset
()
{
Dataset
::
Dataset
()
{
num_class_
=
1
;
num_class_
=
1
;
...
@@ -135,7 +136,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
...
@@ -135,7 +136,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
}
}
Log
::
Info
(
"Saving data to binary file %s"
,
bin_filename
);
Log
::
Info
(
"Saving data to binary file %s"
,
bin_filename
);
size_t
size_of_token
=
std
::
strlen
(
binary_file_token
);
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
// get size of header
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_class_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_class_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
...
...
src/io/dataset_loader.cpp
View file @
522e9993
...
@@ -152,8 +152,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
...
@@ -152,8 +152,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset
->
data_filename_
=
filename
;
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
bool
is_load
in
g
_f
rom_binfil
e
=
CheckCanLoadFromBin
(
filename
);
auto
b
in_f
ilenam
e
=
CheckCanLoadFromBin
(
filename
);
if
(
!
is_load
in
g
_f
rom_binfile
)
{
if
(
b
in_f
ilename
.
size
()
==
0
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data to memory
// read data to memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
rank
,
num_machines
,
&
num_global_data
,
&
used_data_indices
);
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
rank
,
num_machines
,
&
num_global_data
,
&
used_data_indices
);
...
@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
...
@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
}
}
else
{
}
else
{
// load data from binary file
// load data from binary file
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
rank
,
num_machines
));
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
rank
,
num_machines
));
}
}
// check meta data
// check meta data
...
@@ -209,8 +207,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
...
@@ -209,8 +207,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset
->
data_filename_
=
filename
;
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
bool
is_load
in
g
_f
rom_binfil
e
=
CheckCanLoadFromBin
(
filename
);
auto
b
in_f
ilenam
e
=
CheckCanLoadFromBin
(
filename
);
if
(
!
is_load
in
g
_f
rom_binfile
)
{
if
(
b
in_f
ilename
.
size
()
==
0
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data in memory
// read data in memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
0
,
1
,
&
num_global_data
,
&
used_data_indices
);
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
0
,
1
,
&
num_global_data
,
&
used_data_indices
);
...
@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
...
@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
}
}
}
else
{
}
else
{
// load data from binary file
// load data from binary file
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
0
,
1
));
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
0
,
1
));
}
}
// not need to check validation data
// not need to check validation data
...
@@ -260,9 +256,19 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
...
@@ -260,9 +256,19 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
// buffer to read binary file
// buffer to read binary file
size_t
buffer_size
=
16
*
1024
*
1024
;
size_t
buffer_size
=
16
*
1024
*
1024
;
auto
buffer
=
std
::
vector
<
char
>
(
buffer_size
);
auto
buffer
=
std
::
vector
<
char
>
(
buffer_size
);
// check token
size_t
size_of_token
=
std
::
strlen
(
Dataset
::
binary_file_token
);
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
char
),
size_of_token
,
file
);
if
(
read_cnt
!=
size_of_token
)
{
Log
::
Fatal
(
"Binary file error: token has the wrong size"
);
}
if
(
std
::
string
(
buffer
.
data
())
!=
std
::
string
(
Dataset
::
binary_file_token
))
{
Log
::
Fatal
(
"input file is not LightGBM binary file"
);
}
// read size of header
// read size of header
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
size_t
),
1
,
file
);
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
size_t
),
1
,
file
);
if
(
read_cnt
!=
1
)
{
if
(
read_cnt
!=
1
)
{
Log
::
Fatal
(
"Binary file error: header has the wrong size"
);
Log
::
Fatal
(
"Binary file error: header has the wrong size"
);
...
@@ -849,7 +855,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
...
@@ -849,7 +855,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
}
/*! \brief Check can load from binary file */
/*! \brief Check can load from binary file */
bool
DatasetLoader
::
CheckCanLoadFromBin
(
const
char
*
filename
)
{
std
::
string
DatasetLoader
::
CheckCanLoadFromBin
(
const
char
*
filename
)
{
std
::
string
bin_filename
(
filename
);
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
bin_filename
.
append
(
".bin"
);
...
@@ -860,12 +866,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
...
@@ -860,12 +866,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
#else
#else
file
=
fopen
(
bin_filename
.
c_str
(),
"rb"
);
file
=
fopen
(
bin_filename
.
c_str
(),
"rb"
);
#endif
#endif
if
(
file
==
NULL
)
{
if
(
file
==
NULL
)
{
return
false
;
bin_filename
=
std
::
string
(
filename
);
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
.
c_str
(),
"rb"
);
#else
file
=
fopen
(
bin_filename
.
c_str
(),
"rb"
);
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"cannot open data file %s"
,
bin_filename
.
c_str
());
}
}
size_t
buffer_size
=
256
;
auto
buffer
=
std
::
vector
<
char
>
(
buffer_size
);
// read size of token
size_t
size_of_token
=
std
::
strlen
(
Dataset
::
binary_file_token
);
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
char
),
size_of_token
,
file
);
fclose
(
file
);
if
(
read_cnt
==
size_of_token
&&
std
::
string
(
buffer
.
data
())
==
std
::
string
(
Dataset
::
binary_file_token
))
{
return
bin_filename
;
}
else
{
}
else
{
fclose
(
file
);
return
std
::
string
();
return
true
;
}
}
}
}
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment