Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
f56699eb
Commit
f56699eb
authored
Nov 19, 2016
by
Guolin Ke
Browse files
better compression algorithm for sparse bin
parent
80495ca6
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
139 additions
and
117 deletions
+139
-117
src/io/bin.cpp
src/io/bin.cpp
+1
-0
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+6
-6
src/io/ordered_sparse_bin.hpp
src/io/ordered_sparse_bin.hpp
+27
-25
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+105
-86
No files found.
src/io/bin.cpp
View file @
f56699eb
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#include "dense_bin.hpp"
#include "dense_bin.hpp"
#include "sparse_bin.hpp"
#include "sparse_bin.hpp"
#include "ordered_sparse_bin.hpp"
#include <cmath>
#include <cmath>
#include <cstring>
#include <cstring>
...
...
src/io/dense_bin.hpp
View file @
f56699eb
...
@@ -16,7 +16,7 @@ namespace LightGBM {
...
@@ -16,7 +16,7 @@ namespace LightGBM {
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
DenseBin
:
public
Bin
{
class
DenseBin
:
public
Bin
{
public:
public:
explicit
DenseBin
(
data_size_t
num_data
,
int
default_bin
)
DenseBin
(
data_size_t
num_data
,
int
default_bin
)
:
num_data_
(
num_data
)
{
:
num_data_
(
num_data
)
{
data_
.
resize
(
num_data_
);
data_
.
resize
(
num_data_
);
VAL_T
default_bin_T
=
static_cast
<
VAL_T
>
(
default_bin
);
VAL_T
default_bin_T
=
static_cast
<
VAL_T
>
(
default_bin
);
...
@@ -37,8 +37,8 @@ public:
...
@@ -37,8 +37,8 @@ public:
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
void
ConstructHistogram
(
const
data_size_t
*
data_indices
,
data_size_t
num_data
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
const
score_t
*
ordered_gradients
,
const
score_t
*
ordered_hessians
,
HistogramBinEntry
*
out
)
const
override
{
HistogramBinEntry
*
out
)
const
override
{
// use 4-way unrolling, will be faster
// use 4-way unrolling, will be faster
if
(
data_indices
!=
nullptr
)
{
// if use part of data
if
(
data_indices
!=
nullptr
)
{
// if use part of data
data_size_t
rest
=
num_data
%
4
;
data_size_t
rest
=
num_data
%
4
;
...
@@ -70,8 +70,7 @@ public:
...
@@ -70,8 +70,7 @@ public:
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
out
[
bin
].
sum_hessians
+=
ordered_hessians
[
i
];
++
out
[
bin
].
cnt
;
++
out
[
bin
].
cnt
;
}
}
}
}
else
{
// use full data
else
{
// use full data
data_size_t
rest
=
num_data
%
4
;
data_size_t
rest
=
num_data
%
4
;
data_size_t
i
=
0
;
data_size_t
i
=
0
;
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
for
(;
i
<
num_data
-
rest
;
i
+=
4
)
{
...
@@ -105,7 +104,7 @@ public:
...
@@ -105,7 +104,7 @@ public:
}
}
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
lte_count
=
0
;
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
...
@@ -168,5 +167,6 @@ template <typename VAL_T>
...
@@ -168,5 +167,6 @@ template <typename VAL_T>
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
data_size_t
)
const
{
BinIterator
*
DenseBin
<
VAL_T
>::
GetIterator
(
data_size_t
)
const
{
return
new
DenseBinIterator
<
VAL_T
>
(
this
);
return
new
DenseBinIterator
<
VAL_T
>
(
this
);
}
}
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_
#endif // LightGBM_IO_DENSE_BIN_HPP_
src/io/ordered_sparse_bin.hpp
View file @
f56699eb
...
@@ -10,6 +10,8 @@
...
@@ -10,6 +10,8 @@
#include <mutex>
#include <mutex>
#include <algorithm>
#include <algorithm>
#include "sparse_bin.hpp"
namespace
LightGBM
{
namespace
LightGBM
{
/*!
/*!
...
@@ -21,7 +23,7 @@ namespace LightGBM {
...
@@ -21,7 +23,7 @@ namespace LightGBM {
* So we only using ordered bin for sparse situations.
* So we only using ordered bin for sparse situations.
*/
*/
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
OrderedSparseBin
:
public
OrderedBin
{
class
OrderedSparseBin
:
public
OrderedBin
{
public:
public:
/*! \brief Pair to store one bin entry */
/*! \brief Pair to store one bin entry */
struct
SparsePair
{
struct
SparsePair
{
...
@@ -30,14 +32,12 @@ public:
...
@@ -30,14 +32,12 @@ public:
SparsePair
(
data_size_t
r
,
VAL_T
b
)
:
ridx
(
r
),
bin
(
b
)
{}
SparsePair
(
data_size_t
r
,
VAL_T
b
)
:
ridx
(
r
),
bin
(
b
)
{}
};
};
OrderedSparseBin
(
const
std
::
vector
<
uint8_t
>&
delta
,
const
std
::
vector
<
VAL_T
>&
vals
)
OrderedSparseBin
(
const
SparseBin
<
VAL_T
>*
bin_data
)
:
delta_
(
delta
),
vals_
(
vals
)
{
:
bin_data_
(
bin_data
)
{
data_size_t
cur_pos
=
0
;
data_size_t
cur_pos
=
0
;
for
(
size_t
i
=
0
;
i
<
vals_
.
size
();
++
i
)
{
data_size_t
i_delta
=
-
1
;
cur_pos
+=
delta_
[
i
];
while
(
bin_data_
->
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
if
(
vals_
[
i
]
>
0
)
{
ordered_pair_
.
emplace_back
(
cur_pos
,
0
);
ordered_pair_
.
emplace_back
(
cur_pos
,
vals_
[
i
]);
}
}
}
ordered_pair_
.
shrink_to_fit
();
ordered_pair_
.
shrink_to_fit
();
}
}
...
@@ -51,26 +51,24 @@ public:
...
@@ -51,26 +51,24 @@ public:
leaf_cnt_
=
std
::
vector
<
data_size_t
>
(
num_leaves
,
0
);
leaf_cnt_
=
std
::
vector
<
data_size_t
>
(
num_leaves
,
0
);
if
(
used_idices
==
nullptr
)
{
if
(
used_idices
==
nullptr
)
{
// if using all data, copy all non-zero pair
// if using all data, copy all non-zero pair
data_size_t
cur_pos
=
0
;
data_size_t
j
=
0
;
data_size_t
j
=
0
;
for
(
size_t
i
=
0
;
i
<
vals_
.
size
();
++
i
)
{
data_size_t
cur_pos
=
0
;
cur_pos
+=
delta_
[
i
];
data_size_t
i_delta
=
-
1
;
if
(
vals_
[
i
]
>
0
)
{
while
(
bin_data_
->
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
ordered_pair_
[
j
].
ridx
=
cur_pos
;
ordered_pair_
[
j
].
ridx
=
cur_pos
;
ordered_pair_
[
j
].
bin
=
vals_
[
i
];
ordered_pair_
[
j
].
bin
=
bin_data_
->
vals_
[
i_delta
];
++
j
;
++
j
;
}
}
}
leaf_cnt_
[
0
]
=
static_cast
<
data_size_t
>
(
ordered_pair_
.
size
()
);
leaf_cnt_
[
0
]
=
static_cast
<
data_size_t
>
(
j
);
}
else
{
}
else
{
// if using part of data(bagging)
// if using part of data(bagging)
data_size_t
j
=
0
;
data_size_t
j
=
0
;
data_size_t
cur_pos
=
0
;
data_size_t
cur_pos
=
0
;
for
(
size_t
i
=
0
;
i
<
vals_
.
size
();
++
i
)
{
data_
size_t
i
_delta
=
-
1
;
cur_pos
+=
delta_
[
i
];
while
(
bin_data_
->
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
if
(
vals_
[
i
]
>
0
&&
used_idices
[
cur_pos
])
{
if
(
used_idices
[
cur_pos
])
{
ordered_pair_
[
j
].
ridx
=
cur_pos
;
ordered_pair_
[
j
].
ridx
=
cur_pos
;
ordered_pair_
[
j
].
bin
=
vals_
[
i
];
ordered_pair_
[
j
].
bin
=
bin_data_
->
vals_
[
i
_delta
];
++
j
;
++
j
;
}
}
}
}
...
@@ -79,7 +77,7 @@ public:
...
@@ -79,7 +77,7 @@ public:
}
}
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradient
,
const
score_t
*
hessian
,
void
ConstructHistogram
(
int
leaf
,
const
score_t
*
gradient
,
const
score_t
*
hessian
,
HistogramBinEntry
*
out
)
const
override
{
HistogramBinEntry
*
out
)
const
override
{
// get current leaf boundary
// get current leaf boundary
const
data_size_t
start
=
leaf_start_
[
leaf
];
const
data_size_t
start
=
leaf_start_
[
leaf
];
const
data_size_t
end
=
start
+
leaf_cnt_
[
leaf
];
const
data_size_t
end
=
start
+
leaf_cnt_
[
leaf
];
...
@@ -118,9 +116,7 @@ public:
...
@@ -118,9 +116,7 @@ public:
OrderedSparseBin
<
VAL_T
>
(
const
OrderedSparseBin
<
VAL_T
>&
)
=
delete
;
OrderedSparseBin
<
VAL_T
>
(
const
OrderedSparseBin
<
VAL_T
>&
)
=
delete
;
private:
private:
const
std
::
vector
<
uint8_t
>&
delta_
;
const
SparseBin
<
VAL_T
>*
bin_data_
;
const
std
::
vector
<
VAL_T
>&
vals_
;
/*! \brief Store non-zero pair , group by leaf */
/*! \brief Store non-zero pair , group by leaf */
std
::
vector
<
SparsePair
>
ordered_pair_
;
std
::
vector
<
SparsePair
>
ordered_pair_
;
/*! \brief leaf_start_[i] means data in i-th leaf start from */
/*! \brief leaf_start_[i] means data in i-th leaf start from */
...
@@ -128,5 +124,11 @@ private:
...
@@ -128,5 +124,11 @@ private:
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */
std
::
vector
<
data_size_t
>
leaf_cnt_
;
std
::
vector
<
data_size_t
>
leaf_cnt_
;
};
};
template
<
typename
VAL_T
>
OrderedBin
*
SparseBin
<
VAL_T
>::
CreateOrderedBin
()
const
{
return
new
OrderedSparseBin
<
VAL_T
>
(
this
);
}
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_
#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_
src/io/sparse_bin.hpp
View file @
f56699eb
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/bin.h>
#include <LightGBM/bin.h>
#include "ordered_sparse_bin.hpp"
#include <omp.h>
#include <omp.h>
...
@@ -15,23 +14,50 @@
...
@@ -15,23 +14,50 @@
namespace
LightGBM
{
namespace
LightGBM
{
template
<
typename
VAL_T
>
class
SparseBin
;
const
size_t
kNumFastIndex
=
64
;
const
size_t
kNumFastIndex
=
64
;
const
uint8_t
kMaxDelta
=
255
;
template
<
typename
VAL_T
>
class
SparseBinIterator
;
template
<
typename
VAL_T
>
class
SparseBinIterator
:
public
BinIterator
{
public:
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
data_size_t
start_idx
)
:
bin_data_
(
bin_data
)
{
Reset
(
start_idx
);
}
inline
VAL_T
InnerGet
(
data_size_t
idx
);
inline
uint32_t
Get
(
data_size_t
idx
)
override
{
return
InnerGet
(
idx
);
}
inline
void
Reset
(
data_size_t
idx
);
private:
const
SparseBin
<
VAL_T
>*
bin_data_
;
data_size_t
cur_pos_
;
data_size_t
i_delta_
;
};
template
<
typename
VAL_T
>
class
OrderedSparseBin
;
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
SparseBin
:
public
Bin
{
class
SparseBin
:
public
Bin
{
public:
public:
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
SparseBinIterator
<
VAL_T
>
;
friend
class
OrderedSparseBin
<
VAL_T
>
;
explicit
SparseBin
(
data_size_t
num_data
,
int
default_bin
)
SparseBin
(
data_size_t
num_data
,
int
default_bin
)
:
num_data_
(
num_data
)
{
:
num_data_
(
num_data
)
{
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
);
default_bin_
=
static_cast
<
VAL_T
>
(
default_bin
);
if
(
default_bin_
!=
0
)
{
if
(
default_bin_
!=
0
)
{
Log
::
Info
(
"Warning: sparse feature with negative values, treating negative values as zero"
);
Log
::
Info
(
"Warning: sparse feature with negative values, treating negative values as zero"
);
}
}
#pragma omp parallel
#pragma omp parallel
#pragma omp master
#pragma omp master
{
{
num_threads_
=
omp_get_num_threads
();
num_threads_
=
omp_get_num_threads
();
}
}
...
@@ -51,31 +77,39 @@ public:
...
@@ -51,31 +77,39 @@ public:
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
BinIterator
*
GetIterator
(
data_size_t
start_idx
)
const
override
;
void
ConstructHistogram
(
const
data_size_t
*
,
data_size_t
,
const
score_t
*
,
void
ConstructHistogram
(
const
data_size_t
*
,
data_size_t
,
const
score_t
*
,
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
// Will use OrderedSparseBin->ConstructHistogram() instead
// Will use OrderedSparseBin->ConstructHistogram() instead
Log
::
Fatal
(
"Using OrderedSparseBin->ConstructHistogram() instead"
);
Log
::
Fatal
(
"Using OrderedSparseBin->ConstructHistogram() instead"
);
}
}
inline
bool
NextNonzero
(
data_size_t
*
i_delta
,
data_size_t
*
cur_pos
)
const
{
++
(
*
i_delta
);
*
cur_pos
+=
deltas_
[
*
i_delta
];
data_size_t
factor
=
1
;
while
(
*
i_delta
<
num_vals_
&&
vals_
[
*
i_delta
]
==
0
)
{
++
(
*
i_delta
);
factor
*=
kMaxDelta
;
*
cur_pos
+=
deltas_
[
*
i_delta
]
*
factor
;
}
if
(
*
i_delta
>=
0
&&
*
i_delta
<
num_vals_
)
{
return
true
;
}
else
{
return
false
;
}
}
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
// not need to split
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
if
(
num_data
<=
0
)
{
return
0
;
}
const
auto
fast_pair
=
fast_index_
[(
data_indices
[
0
])
>>
fast_index_shift_
];
SparseBinIterator
<
VAL_T
>
iterator
(
this
,
data_indices
[
0
]);
data_size_t
j
=
fast_pair
.
first
;
data_size_t
cur_pos
=
fast_pair
.
second
;
data_size_t
lte_count
=
0
;
data_size_t
lte_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
data_size_t
idx
=
data_indices
[
i
];
while
(
cur_pos
<
idx
&&
j
<
num_vals_
)
{
VAL_T
bin
=
iterator
.
InnerGet
(
idx
);
++
j
;
cur_pos
+=
delta_
[
j
];
}
VAL_T
bin
=
0
;
if
(
cur_pos
==
idx
&&
j
<
num_vals_
)
{
bin
=
vals_
[
j
];
}
if
(
bin
>
threshold
)
{
if
(
bin
>
threshold
)
{
gt_indices
[
gt_count
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
}
else
{
...
@@ -87,9 +121,7 @@ public:
...
@@ -87,9 +121,7 @@ public:
data_size_t
num_data
()
const
override
{
return
num_data_
;
}
data_size_t
num_data
()
const
override
{
return
num_data_
;
}
OrderedBin
*
CreateOrderedBin
()
const
override
{
OrderedBin
*
CreateOrderedBin
()
const
override
;
return
new
OrderedSparseBin
<
VAL_T
>
(
delta_
,
vals_
);
}
void
FinishLoad
()
override
{
void
FinishLoad
()
override
{
// get total non zero size
// get total non zero size
...
@@ -119,30 +151,29 @@ public:
...
@@ -119,30 +151,29 @@ public:
}
}
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
non_zero_pair
)
{
void
LoadFromPair
(
const
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>&
non_zero_pair
)
{
delta_
.
clear
();
delta
s
_
.
clear
();
vals_
.
clear
();
vals_
.
clear
();
// transform to delta array
// transform to delta array
const
uint8_t
kMaxDelta
=
255
;
data_size_t
last_idx
=
0
;
data_size_t
last_idx
=
0
;
for
(
size_t
i
=
0
;
i
<
non_zero_pair
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
non_zero_pair
.
size
();
++
i
)
{
const
data_size_t
cur_idx
=
non_zero_pair
[
i
].
first
;
const
data_size_t
cur_idx
=
non_zero_pair
[
i
].
first
;
const
VAL_T
bin
=
non_zero_pair
[
i
].
second
;
const
VAL_T
bin
=
non_zero_pair
[
i
].
second
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
data_size_t
cur_delta
=
cur_idx
-
last_idx
;
while
(
cur_delta
>
kMaxDelta
)
{
while
(
cur_delta
>
kMaxDelta
)
{
delta_
.
push_back
(
255
);
delta
s
_
.
push_back
(
cur_delta
%
kMaxDelta
);
vals_
.
push_back
(
0
);
vals_
.
push_back
(
0
);
cur_delta
-
=
kMaxDelta
;
cur_delta
/
=
kMaxDelta
;
}
}
delta_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
delta
s
_
.
push_back
(
static_cast
<
uint8_t
>
(
cur_delta
));
vals_
.
push_back
(
bin
);
vals_
.
push_back
(
bin
);
last_idx
=
cur_idx
;
last_idx
=
cur_idx
;
}
}
// avoid out of range
// avoid out of range
delta_
.
push_back
(
0
);
delta
s
_
.
push_back
(
0
);
num_vals_
=
static_cast
<
data_size_t
>
(
vals_
.
size
());
num_vals_
=
static_cast
<
data_size_t
>
(
vals_
.
size
());
// reduce memory cost
// reduce memory cost
delta_
.
shrink_to_fit
();
delta
s
_
.
shrink_to_fit
();
vals_
.
shrink_to_fit
();
vals_
.
shrink_to_fit
();
// generate fast index
// generate fast index
...
@@ -160,26 +191,26 @@ public:
...
@@ -160,26 +191,26 @@ public:
++
fast_index_shift_
;
++
fast_index_shift_
;
}
}
// build fast index
// build fast index
data_size_t
next_i
=
0
;
data_size_t
i_delta
=
-
1
;
data_size_t
cur_pos
=
0
;
data_size_t
cur_pos
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_vals_
;
++
i
)
{
data_size_t
next_threshold
=
0
;
cur_pos
+=
delta_
[
i
];
while
(
NextNonzero
(
&
i_delta
,
&
cur_pos
))
{
while
(
next_
i
<
cur_pos
)
{
while
(
next_
threshold
<
cur_pos
)
{
fast_index_
.
emplace_back
(
i
,
cur_pos
);
fast_index_
.
emplace_back
(
i
_delta
,
cur_pos
);
next_
i
+=
pow2_mod_size
;
next_
threshold
+=
pow2_mod_size
;
}
}
}
}
// avoid out of range
// avoid out of range
while
(
next_
i
<
num_data_
)
{
while
(
next_
threshold
<
num_data_
)
{
fast_index_
.
emplace_back
(
num_vals_
-
1
,
cur_pos
);
fast_index_
.
emplace_back
(
num_vals_
-
1
,
cur_pos
);
next_
i
+=
pow2_mod_size
;
next_
threshold
+=
pow2_mod_size
;
}
}
fast_index_
.
shrink_to_fit
();
fast_index_
.
shrink_to_fit
();
}
}
void
SaveBinaryToFile
(
FILE
*
file
)
const
override
{
void
SaveBinaryToFile
(
FILE
*
file
)
const
override
{
fwrite
(
&
num_vals_
,
sizeof
(
num_vals_
),
1
,
file
);
fwrite
(
&
num_vals_
,
sizeof
(
num_vals_
),
1
,
file
);
fwrite
(
delta_
.
data
(),
sizeof
(
uint8_t
),
num_vals_
+
1
,
file
);
fwrite
(
delta
s
_
.
data
(),
sizeof
(
uint8_t
),
num_vals_
+
1
,
file
);
fwrite
(
vals_
.
data
(),
sizeof
(
VAL_T
),
num_vals_
,
file
);
fwrite
(
vals_
.
data
(),
sizeof
(
VAL_T
),
num_vals_
,
file
);
}
}
...
@@ -196,39 +227,33 @@ public:
...
@@ -196,39 +227,33 @@ public:
mem_ptr
+=
sizeof
(
uint8_t
)
*
(
tmp_num_vals
+
1
);
mem_ptr
+=
sizeof
(
uint8_t
)
*
(
tmp_num_vals
+
1
);
const
VAL_T
*
tmp_vals
=
reinterpret_cast
<
const
VAL_T
*>
(
mem_ptr
);
const
VAL_T
*
tmp_vals
=
reinterpret_cast
<
const
VAL_T
*>
(
mem_ptr
);
if
(
local_used_indices
.
size
()
<=
0
)
{
deltas_
.
clear
();
delta_
.
clear
();
vals_
.
clear
();
vals_
.
clear
();
num_vals_
=
tmp_num_vals
;
num_vals_
=
tmp_num_vals
;
for
(
data_size_t
i
=
0
;
i
<
num_vals_
;
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
num_vals_
;
++
i
)
{
deltas_
.
push_back
(
tmp_delta
[
i
]);
delta_
.
push_back
(
tmp_delta
[
i
]);
vals_
.
push_back
(
tmp_vals
[
i
]);
vals_
.
push_back
(
tmp_vals
[
i
]);
}
}
deltas_
.
push_back
(
0
);
delta_
.
push_back
(
0
);
// reduce memory cost
// reduce memory cost
deltas_
.
shrink_to_fit
();
delta_
.
shrink_to_fit
();
vals_
.
shrink_to_fit
();
vals_
.
shrink_to_fit
();
if
(
local_used_indices
.
size
()
<=
0
)
{
// generate fast index
// generate fast index
GetFastIndex
();
GetFastIndex
();
}
else
{
}
else
{
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
tmp_pair
;
data_size_t
cur_pos
=
tmp_delta
[
0
]
;
data_size_t
cur_pos
=
0
;
data_size_t
j
=
0
;
data_size_t
j
=
-
1
;
for
(
data_size_t
i
=
0
;
i
<
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
++
i
)
{
for
(
data_size_t
i
=
0
;
i
<
static_cast
<
data_size_t
>
(
local_used_indices
.
size
());
++
i
)
{
const
data_size_t
idx
=
local_used_indices
[
i
];
const
data_size_t
idx
=
local_used_indices
[
i
];
while
(
cur_pos
<
idx
&&
j
<
tmp_num_vals
)
{
while
(
cur_pos
<
idx
&&
j
<
num_vals_
)
{
++
j
;
NextNonzero
(
&
j
,
&
cur_pos
);
cur_pos
+=
tmp_delta
[
j
];
}
}
VAL_T
bin
=
0
;
if
(
cur_pos
==
idx
&&
j
<
num_vals_
)
{
if
(
cur_pos
==
idx
&&
j
<
tmp_num_vals
)
{
bin
=
tmp_vals
[
j
];
}
if
(
bin
>
0
)
{
// new row index is i
// new row index is i
tmp_pair
.
emplace_back
(
i
,
bin
);
tmp_pair
.
emplace_back
(
i
,
vals_
[
j
]
);
}
}
}
}
LoadFromPair
(
tmp_pair
);
LoadFromPair
(
tmp_pair
);
...
@@ -239,7 +264,7 @@ public:
...
@@ -239,7 +264,7 @@ public:
private:
private:
data_size_t
num_data_
;
data_size_t
num_data_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero_pair_
;
std
::
vector
<
std
::
pair
<
data_size_t
,
VAL_T
>>
non_zero_pair_
;
std
::
vector
<
uint8_t
>
delta_
;
std
::
vector
<
uint8_t
>
delta
s
_
;
std
::
vector
<
VAL_T
>
vals_
;
std
::
vector
<
VAL_T
>
vals_
;
data_size_t
num_vals_
;
data_size_t
num_vals_
;
int
num_threads_
;
int
num_threads_
;
...
@@ -250,36 +275,30 @@ private:
...
@@ -250,36 +275,30 @@ private:
};
};
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
class
SparseBinIterator
:
public
BinIterator
{
inline
VAL_T
SparseBinIterator
<
VAL_T
>::
InnerGet
(
data_size_t
idx
)
{
public:
while
(
cur_pos_
<
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
SparseBinIterator
(
const
SparseBin
<
VAL_T
>*
bin_data
,
data_size_t
start_idx
)
bin_data_
->
NextNonzero
(
&
i_delta_
,
&
cur_pos_
);
:
bin_data_
(
bin_data
)
{
const
auto
fast_pair
=
bin_data
->
fast_index_
[
start_idx
>>
bin_data
->
fast_index_shift_
];
i_delta_
=
fast_pair
.
first
;
cur_pos_
=
fast_pair
.
second
;
}
}
uint32_t
Get
(
data_size_t
idx
)
override
{
if
(
cur_pos_
==
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
while
(
cur_pos_
<
idx
&&
i_delta_
<
bin_data_
->
num_vals_
)
{
return
bin_data_
->
vals_
[
i_delta_
];
++
i_delta_
;
}
else
{
cur_pos_
+=
bin_data_
->
delta_
[
i_delta_
];
return
0
;
}
if
(
idx
==
cur_pos_
&&
i_delta_
>=
0
&&
i_delta_
<
bin_data_
->
vals_
.
size
())
{
return
bin_data_
->
vals_
[
i_delta_
];
}
else
{
return
0
;
}
}
}
}
private:
template
<
typename
VAL_T
>
const
SparseBin
<
VAL_T
>*
bin_data_
;
inline
void
SparseBinIterator
<
VAL_T
>::
Reset
(
data_size_t
start_idx
)
{
data_size_t
cur_pos_
=
0
;
const
auto
fast_pair
=
bin_data_
->
fast_index_
[
start_idx
>>
bin_data_
->
fast_index_shift_
]
;
data_size_t
i_delta_
=
0
;
i_delta_
=
fast_pair
.
first
;
}
;
cur_pos_
=
fast_pair
.
second
;
}
template
<
typename
VAL_T
>
template
<
typename
VAL_T
>
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
data_size_t
start_idx
)
const
{
BinIterator
*
SparseBin
<
VAL_T
>::
GetIterator
(
data_size_t
start_idx
)
const
{
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
start_idx
);
return
new
SparseBinIterator
<
VAL_T
>
(
this
,
start_idx
);
}
}
}
// namespace LightGBM
}
// namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_
#endif // LightGBM_IO_SPARSE_BIN_HPP_
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment