Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
3489607f
Commit
3489607f
authored
Jan 24, 2017
by
Guolin Ke
Browse files
reduce function call cost for constructing subset.
parent
6c736da9
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
58 additions
and
30 deletions
+58
-30
include/LightGBM/bin.h
include/LightGBM/bin.h
+2
-0
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+1
-1
include/LightGBM/feature.h
include/LightGBM/feature.h
+10
-1
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+2
-2
src/c_api.cpp
src/c_api.cpp
+4
-7
src/io/dataset.cpp
src/io/dataset.cpp
+9
-6
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+2
-2
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+7
-0
src/io/ordered_sparse_bin.hpp
src/io/ordered_sparse_bin.hpp
+4
-3
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+17
-8
No files found.
include/LightGBM/bin.h
View file @
3489607f
...
...
@@ -263,6 +263,8 @@ public:
*/
virtual
void
Push
(
int
tid
,
data_size_t
idx
,
uint32_t
value
)
=
0
;
/*!
* \brief Fill this bin from selected rows of another bin
* \param full_bin Bin to copy from
* \param used_indices Row indices into full_bin; entry i supplies row i of this bin
* \param num_used_indices Number of entries in used_indices
*/
virtual
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
=
0
;
/*!
* \brief Get bin iterator of this bin
* \param start_idx start index of this
...
...
include/LightGBM/dataset.h
View file @
3489607f
...
...
@@ -351,7 +351,7 @@ public:
*/
LIGHTGBM_EXPORT
void
SaveBinaryFile
(
const
char
*
bin_filename
);
LIGHTGBM_EXPORT
void
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
,
bool
is_enable_sparse
);
LIGHTGBM_EXPORT
void
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
);
/*!
* \brief Get a feature pointer for specific index
...
...
include/LightGBM/feature.h
View file @
3489607f
...
...
@@ -80,12 +80,21 @@ public:
unsigned
int
bin
=
bin_mapper_
->
ValueToBin
(
value
);
bin_data_
->
Push
(
tid
,
line_idx
,
bin
);
}
/*!
 * \brief Store a bin value for one row, without any value-to-bin mapping
 * \param tid Forwarded unchanged to the underlying Push call
 * \param line_idx Row index the bin value belongs to
 * \param bin Bin value to store
 */
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
  // Pure forwarder: the caller supplies the bin directly.
  bin_data_->Push(tid, line_idx, bin);
}
void
ReSize
(
data_size_t
num_data
)
{
/*!
 * \brief Fill this feature's bin data from selected rows of another feature
 * \param full_feature Feature whose bin data is the copy source
 * \param used_indices Row indices to take from full_feature
 * \param num_used_indices Number of entries in used_indices
 */
inline void CopySubset(const Feature* full_feature,
                       const data_size_t* used_indices,
                       data_size_t num_used_indices) {
  // Hand the raw source bin to the bin-level CopySubset.
  const auto* source_bin = full_feature->bin_data_.get();
  bin_data_->CopySubset(source_bin, used_indices, num_used_indices);
}
/*!
 * \brief Forward a resize request to the underlying bin data
 * \param num_data Size passed through to the bin's ReSize
 */
inline void ReSize(data_size_t num_data) {
  bin_data_->ReSize(num_data);
}
/*! \brief Value of the is_sparse_ flag stored on this feature */
inline bool is_sparse() const {
  return is_sparse_;
}
/*! \brief Forward FinishLoad to the underlying bin data */
inline void FinishLoad() {
  bin_data_->FinishLoad();
}
/*! \brief Index of this feature (returns the stored feature_index_) */
inline int feature_index() const {
  return feature_index_;
}
...
...
src/boosting/gbdt.cpp
View file @
3489607f
...
...
@@ -133,9 +133,9 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
right_write_pos_buf_
.
resize
(
num_threads_
);
double
average_bag_rate
=
new_config
->
bagging_fraction
/
new_config
->
bagging_freq
;
is_use_subset_
=
false
;
if
(
average_bag_rate
<
0.5
)
{
if
(
average_bag_rate
<
=
0.5
)
{
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt_
));
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data
,
false
);
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data
);
is_use_subset_
=
true
;
Log
::
Debug
(
"use subset for bagging"
);
}
...
...
src/c_api.cpp
View file @
3489607f
...
...
@@ -335,8 +335,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(guided)
...
...
@@ -397,8 +396,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(guided)
...
...
@@ -450,8 +448,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
reinterpret_cast
<
const
Dataset
*>
(
reference
));
}
#pragma omp parallel for schedule(guided)
...
...
@@ -486,7 +483,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetSubset(
io_config
.
Set
(
param
);
auto
full_dataset
=
reinterpret_cast
<
const
Dataset
*>
(
handle
);
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_used_row_indices
));
ret
->
CopyFeatureMapperFrom
(
full_dataset
,
io_config
.
is_enable_sparse
);
ret
->
CopyFeatureMapperFrom
(
full_dataset
);
ret
->
CopySubset
(
full_dataset
,
used_row_indices
,
num_used_row_indices
,
true
);
*
out
=
ret
.
release
();
API_END
();
...
...
src/io/dataset.cpp
View file @
3489607f
...
...
@@ -38,9 +38,16 @@ void Dataset::FinishLoad() {
}
}
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
,
bool
is_enable_sparse
)
{
void
Dataset
::
CopyFeatureMapperFrom
(
const
Dataset
*
dataset
)
{
features_
.
clear
();
num_features_
=
dataset
->
num_features_
;
bool
is_enable_sparse
=
false
;
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
if
(
dataset
->
features_
[
i
]
->
is_sparse
())
{
is_enable_sparse
=
true
;
break
;
}
}
// copy feature bin mapper data
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
){
features_
.
emplace_back
(
new
Feature
(
dataset
->
features_
[
i
]
->
feature_index
(),
...
...
@@ -69,15 +76,11 @@ void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices
CHECK
(
num_used_indices
==
num_data_
);
#pragma omp parallel for schedule(guided)
for
(
int
fidx
=
0
;
fidx
<
num_features_
;
++
fidx
)
{
auto
iterator
=
fullset
->
features_
[
fidx
]
->
bin_data
()
->
GetIterator
(
used_indices
[
0
]);
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
features_
[
fidx
]
->
PushBin
(
0
,
i
,
iterator
->
Get
(
used_indices
[
i
]));
}
features_
[
fidx
]
->
CopySubset
(
fullset
->
features_
[
fidx
].
get
(),
used_indices
,
num_used_indices
);
}
if
(
need_meta_data
)
{
metadata_
.
Init
(
metadata_
,
used_indices
,
num_used_indices
);
}
FinishLoad
();
}
bool
Dataset
::
SetFloatField
(
const
char
*
field_name
,
const
float
*
field_data
,
data_size_t
num_element
)
{
...
...
src/io/dataset_loader.cpp
View file @
3489607f
...
...
@@ -238,7 +238,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_data
.
size
());
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
CopyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
dataset
->
CopyFeatureMapperFrom
(
train_data
);
// extract features
ExtractFeaturesFromMemory
(
text_data
,
parser
.
get
(),
dataset
.
get
());
text_data
.
clear
();
...
...
@@ -249,7 +249,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
num_global_data
=
dataset
->
num_data_
;
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
CopyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
dataset
->
CopyFeatureMapperFrom
(
train_data
);
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
}
...
...
src/io/dense_bin.hpp
View file @
3489607f
...
...
@@ -147,6 +147,13 @@ public:
}
}
void
CopySubset
(
const
Bin
*
full_bin
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
override
{
auto
other_bin
=
reinterpret_cast
<
const
DenseBin
<
VAL_T
>*>
(
full_bin
);
for
(
int
i
=
0
;
i
<
num_used_indices
;
++
i
)
{
data_
[
i
]
=
other_bin
->
data_
[
used_indices
[
i
]];
}
}
/*!
 * \brief Write the raw bin values to an already-open file
 * \param file Destination stream; num_data_ elements of sizeof(VAL_T) bytes are written
 * \note The return value of fwrite is not checked.
 */
void SaveBinaryToFile(FILE* file) const override {
  const auto raw_values = data_.data();
  const auto element_size = sizeof(VAL_T);
  fwrite(raw_values, element_size, num_data_, file);
}
...
...
src/io/ordered_sparse_bin.hpp
View file @
3489607f
...
...
@@ -29,17 +29,18 @@ public:
struct
SparsePair
{
data_size_t
ridx
;
// data(row) index
VAL_T
bin
;
// bin for this data
SparsePair
(
data_size_t
r
,
VAL_T
b
)
:
ridx
(
r
),
bin
(
b
)
{}
SparsePair
()
:
ridx
(
0
),
bin
(
0
)
{}
};
OrderedSparseBin
(
const
SparseBin
<
VAL_T
>*
bin_data
)
:
bin_data_
(
bin_data
)
{
data_size_t
cur_pos
=
0
;
data_size_t
i_delta
=
-
1
;
int
non_zero_cnt
=
0
;
while
(
bin_data_
->
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
ordered_pair_
.
emplace_back
(
cur_pos
,
static_cast
<
VAL_T
>
(
0
))
;
++
non_zero_cnt
;
}
ordered_pair_
.
shrink_to_fit
(
);
ordered_pair_
.
resize
(
non_zero_cnt
);
}
~
OrderedSparseBin
()
{
...
...
src/io/sparse_bin.hpp
View file @
3489607f
...
...
@@ -134,25 +134,23 @@ public:
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero_size
+=
push_buffers_
[
i
].
size
();
}
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero_pair
;
// merge
non_zero_pair
_
.
reserve
(
non_zero_size
);
non_zero_pair
.
reserve
(
non_zero_size
);
for
(
size_t
i
=
0
;
i
<
push_buffers_
.
size
();
++
i
)
{
non_zero_pair
_
.
insert
(
non_zero_pair
_
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
non_zero_pair
.
insert
(
non_zero_pair
.
end
(),
push_buffers_
[
i
].
begin
(),
push_buffers_
[
i
].
end
());
push_buffers_
[
i
].
clear
();
push_buffers_
[
i
].
shrink_to_fit
();
}
push_buffers_
.
clear
();
push_buffers_
.
shrink_to_fit
();
// sort by data index
std
::
sort
(
non_zero_pair
_
.
begin
(),
non_zero_pair
_
.
end
(),
std
::
sort
(
non_zero_pair
.
begin
(),
non_zero_pair
.
end
(),
[](
const
std
::
pair
<
data_size_t
,
VAL_T
>&
a
,
const
std
::
pair
<
data_size_t
,
VAL_T
>&
b
)
{
return
a
.
first
<
b
.
first
;
});
// load delta array
LoadFromPair
(
non_zero_pair_
);
// free memory
non_zero_pair_
.
clear
();
non_zero_pair_
.
shrink_to_fit
();
LoadFromPair
(
non_zero_pair
);
}
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
non_zero_pair
)
{
...
...
@@ -264,12 +262,23 @@ public:
}
LoadFromPair
(
tmp_pair
);
}
}
/*!
 * \brief Rebuild this sparse bin from selected rows of full_bin
 * \param full_bin Source bin; must really be a SparseBin<VAL_T> (not checked here)
 * \param used_indices Row indices into full_bin; entry i becomes row i of this bin
 * \param num_used_indices Number of entries in used_indices
 *
 * Only rows whose bin value is greater than zero are collected; the resulting
 * (new row, bin) pairs are handed to LoadFromPair.
 */
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
  std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
  // Guard the empty case: used_indices[0] must not be read when there are
  // no indices. An empty subset still goes through LoadFromPair below.
  if (num_used_indices > 0) {
    // static_cast is the appropriate cast for a base-to-derived downcast.
    const auto* other_bin = static_cast<const SparseBin<VAL_T>*>(full_bin);
    // Start the iterator at the first requested row.
    SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]);
    for (data_size_t i = 0; i < num_used_indices; ++i) {
      VAL_T bin = iterator.InnerGet(used_indices[i]);
      if (bin > 0) {
        // Store under the new row index i, not the original row index.
        tmp_pair.emplace_back(i, bin);
      }
    }
  }
  LoadFromPair(tmp_pair);
}
protected:
data_size_t
num_data_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero_pair_
;
std
::
vector
<
uint8_t
>
deltas_
;
std
::
vector
<
VAL_T
>
vals_
;
data_size_t
num_vals_
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment