Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
c6512e01
Commit
c6512e01
authored
Nov 09, 2016
by
Guolin Ke
Browse files
reduce memory cost at sample phase
parent
2e5e9134
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
44 additions
and
39 deletions
+44
-39
include/LightGBM/bin.h
include/LightGBM/bin.h
+1
-1
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+1
-1
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+3
-1
src/c_api.cpp
src/c_api.cpp
+15
-16
src/io/bin.cpp
src/io/bin.cpp
+12
-4
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+12
-16
No files found.
include/LightGBM/bin.h
View file @
c6512e01
...
@@ -86,7 +86,7 @@ public:
...
@@ -86,7 +86,7 @@ public:
* \param values (Sampled) values of this feature
* \param values (Sampled) values of this feature
* \param max_bin The maximal number of bin
* \param max_bin The maximal number of bin
*/
*/
void
FindBin
(
std
::
vector
<
double
>*
values
,
int
max_bin
);
void
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
);
/*!
/*!
* \brief Use specific number of bin to calculate the size of this class
* \brief Use specific number of bin to calculate the size of this class
...
...
include/LightGBM/dataset_loader.h
View file @
c6512e01
...
@@ -24,7 +24,7 @@ public:
...
@@ -24,7 +24,7 @@ public:
Dataset
*
LoadFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
);
Dataset
*
LoadFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
);
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
data_size_t
num_data
);
Dataset
*
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
);
/*! \brief Disable copy */
/*! \brief Disable copy */
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
DatasetLoader
&
operator
=
(
const
DatasetLoader
&
)
=
delete
;
...
...
src/boosting/gbdt.cpp
View file @
c6512e01
...
@@ -464,7 +464,9 @@ std::string GBDT::FeatureImportance() const {
...
@@ -464,7 +464,9 @@ std::string GBDT::FeatureImportance() const {
// store the importance first
// store the importance first
std
::
vector
<
std
::
pair
<
size_t
,
std
::
string
>>
pairs
;
std
::
vector
<
std
::
pair
<
size_t
,
std
::
string
>>
pairs
;
for
(
size_t
i
=
0
;
i
<
feature_importances
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
feature_importances
.
size
();
++
i
)
{
pairs
.
emplace_back
(
feature_importances
[
i
],
train_data_
->
feature_names
()[
i
]);
if
(
feature_importances
[
i
]
>
0
)
{
pairs
.
emplace_back
(
feature_importances
[
i
],
train_data_
->
feature_names
()[
i
]);
}
}
}
// sort the importance
// sort the importance
std
::
sort
(
pairs
.
begin
(),
pairs
.
end
(),
std
::
sort
(
pairs
.
begin
(),
pairs
.
end
(),
...
...
src/c_api.cpp
View file @
c6512e01
...
@@ -206,10 +206,12 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
...
@@ -206,10 +206,12 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
auto
idx
=
sample_indices
[
i
];
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
row
.
size
();
++
j
)
{
sample_values
[
j
].
push_back
(
row
[
j
]);
if
(
std
::
fabs
(
row
[
j
])
>
1e-15
)
{
sample_values
[
j
].
push_back
(
row
[
j
]);
}
}
}
}
}
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
...
@@ -253,25 +255,22 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
...
@@ -253,25 +255,22 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
auto
idx
=
sample_indices
[
i
];
auto
idx
=
sample_indices
[
i
];
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
auto
row
=
get_row_fun
(
static_cast
<
int
>
(
idx
));
// push 0 first, then edit the value according existing feature values
for
(
auto
&
feature_values
:
sample_values
)
{
feature_values
.
push_back
(
0.0
);
}
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
row
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
// if need expand feature set
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
// if need expand feature set
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
// push i+1 0
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
(
i
+
1
,
0.0
f
);
sample_values
.
emplace_back
();
}
}
}
// edit the feature value
sample_values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
}
}
// edit the feature value
sample_values
[
inner_data
.
first
][
i
]
=
inner_data
.
second
;
}
}
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
...
@@ -319,7 +318,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
...
@@ -319,7 +318,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
auto
cur_col
=
get_col_fun
(
i
);
auto
cur_col
=
get_col_fun
(
i
);
sample_values
[
i
]
=
SampleFromOneColumn
(
cur_col
,
sample_indices
);
sample_values
[
i
]
=
SampleFromOneColumn
(
cur_col
,
sample_indices
);
}
}
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
nrow
);
ret
=
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
);
}
else
{
}
else
{
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
=
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
...
...
src/io/bin.cpp
View file @
c6512e01
...
@@ -39,16 +39,24 @@ BinMapper::~BinMapper() {
...
@@ -39,16 +39,24 @@ BinMapper::~BinMapper() {
delete
[]
bin_upper_bound_
;
delete
[]
bin_upper_bound_
;
}
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
int
max_bin
)
{
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
)
{
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
size_t
sample_size
=
values
->
size
();
size_t
sample_size
=
total_sample_cnt
;
size_t
zero_cnt
=
total_sample_cnt
-
ref_values
.
size
();
// find distinct_values first
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
ref_values
.
begin
(),
ref_values
.
end
());
std
::
sort
(
ref_values
.
begin
(),
ref_values
.
end
());
distinct_values
.
push_back
(
ref_values
[
0
]);
// push 0 first
counts
.
push_back
(
1
);
if
(
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0.0
f
);
counts
.
push_back
(
static_cast
<
int
>
(
zero_cnt
));
}
if
(
ref_values
.
size
()
>
0
)
{
distinct_values
.
push_back
(
ref_values
[
0
]);
counts
.
push_back
(
1
);
}
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
distinct_values
.
push_back
(
ref_values
[
i
]);
distinct_values
.
push_back
(
ref_values
[
i
]);
...
...
src/io/dataset_loader.cpp
View file @
c6512e01
...
@@ -408,12 +408,12 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
...
@@ -408,12 +408,12 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
return
dataset
;
return
dataset
;
}
}
Dataset
*
DatasetLoader
::
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
data_size_t
num_data
)
{
Dataset
*
DatasetLoader
::
CostructFromSampleData
(
std
::
vector
<
std
::
vector
<
double
>>&
sample_values
,
size_t
total_sample_size
,
data_size_t
num_data
)
{
std
::
vector
<
BinMapper
*>
bin_mappers
(
sample_values
.
size
());
std
::
vector
<
BinMapper
*>
bin_mappers
(
sample_values
.
size
());
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
io_config_
.
max_bin
);
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
total_sample_size
,
io_config_
.
max_bin
);
}
}
Dataset
*
dataset
=
new
Dataset
();
Dataset
*
dataset
=
new
Dataset
();
...
@@ -580,21 +580,17 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -580,21 +580,17 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
oneline_features
.
clear
();
oneline_features
.
clear
();
// parse features
// parse features
parser
->
ParseOneLine
(
sample_data
[
i
].
c_str
(),
&
oneline_features
,
&
label
);
parser
->
ParseOneLine
(
sample_data
[
i
].
c_str
(),
&
oneline_features
,
&
label
);
// push 0 first, then edit the value according existing feature values
for
(
auto
&
feature_values
:
sample_values
)
{
feature_values
.
push_back
(
0.0
);
}
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
oneline_features
)
{
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
oneline_features
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
if
(
std
::
fabs
(
inner_data
.
second
)
>
1e-15
)
{
// if need expand feature set
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
// if need expand feature set
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
// push i+1 0
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
sample_values
.
emplace_back
(
i
+
1
,
0.0
f
);
sample_values
.
emplace_back
();
}
}
}
sample_values
[
inner_data
.
first
].
push_back
(
inner_data
.
second
);
}
}
// edit the feature value
sample_values
[
inner_data
.
first
][
i
]
=
inner_data
.
second
;
}
}
}
}
...
@@ -629,7 +625,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -629,7 +625,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
continue
;
continue
;
}
}
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
io_config_
.
max_bin
);
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
sample_data
.
size
(),
io_config_
.
max_bin
);
}
}
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
...
@@ -676,7 +672,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -676,7 +672,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinMapper
*
bin_mapper
=
new
BinMapper
();
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
FindBin
(
&
sample_values
[
start
[
rank
]
+
i
],
io_config_
.
max_bin
);
bin_mapper
->
FindBin
(
&
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
);
bin_mapper
->
CopyTo
(
input_buffer
+
i
*
type_size
);
bin_mapper
->
CopyTo
(
input_buffer
+
i
*
type_size
);
// don't need this any more
// don't need this any more
delete
bin_mapper
;
delete
bin_mapper
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment