Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
952099d6
Commit
952099d6
authored
Nov 13, 2016
by
Guolin Ke
Browse files
fix bugs in bin finder
parent
9f04f276
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
63 additions
and
73 deletions
+63
-73
include/LightGBM/utils/common.h
include/LightGBM/utils/common.h
+2
-14
src/boosting/dart.cpp
src/boosting/dart.cpp
+4
-4
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+1
-1
src/io/bin.cpp
src/io/bin.cpp
+46
-45
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+1
-1
src/network/linkers.h
src/network/linkers.h
+4
-3
src/network/network.cpp
src/network/network.cpp
+2
-2
src/network/socket_wrapper.hpp
src/network/socket_wrapper.hpp
+1
-1
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+2
-2
No files found.
include/LightGBM/utils/common.h
View file @
952099d6
...
@@ -16,18 +16,6 @@ namespace LightGBM {
...
@@ -16,18 +16,6 @@ namespace LightGBM {
namespace
Common
{
namespace
Common
{
template
<
typename
T
>
inline
static
T
Max
(
const
T
&
a
,
const
T
&
b
)
{
return
a
>
b
?
a
:
b
;
}
template
<
typename
T
>
inline
static
T
Min
(
const
T
&
a
,
const
T
&
b
)
{
return
a
<
b
?
a
:
b
;
}
inline
static
std
::
string
&
Trim
(
std
::
string
&
str
)
{
inline
static
std
::
string
&
Trim
(
std
::
string
&
str
)
{
if
(
str
.
size
()
<=
0
)
{
if
(
str
.
size
()
<=
0
)
{
return
str
;
return
str
;
...
@@ -329,8 +317,8 @@ inline static std::string Join(const std::vector<T>& strs, size_t start, size_t
...
@@ -329,8 +317,8 @@ inline static std::string Join(const std::vector<T>& strs, size_t start, size_t
if
(
end
-
start
<=
0
)
{
if
(
end
-
start
<=
0
)
{
return
std
::
string
(
""
);
return
std
::
string
(
""
);
}
}
start
=
Min
<
size_t
>
(
start
,
static_cast
<
size_t
>
(
strs
.
size
())
-
1
);
start
=
std
::
min
(
start
,
static_cast
<
size_t
>
(
strs
.
size
())
-
1
);
end
=
Min
<
size_t
>
(
end
,
static_cast
<
size_t
>
(
strs
.
size
()));
end
=
std
::
min
(
end
,
static_cast
<
size_t
>
(
strs
.
size
()));
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
strs
[
start
];
ss
<<
strs
[
start
];
for
(
size_t
i
=
start
+
1
;
i
<
end
;
++
i
)
{
for
(
size_t
i
=
start
+
1
;
i
<
end
;
++
i
)
{
...
...
src/boosting/dart.cpp
View file @
952099d6
...
@@ -110,9 +110,9 @@ void DART::DroppingTrees() {
...
@@ -110,9 +110,9 @@ void DART::DroppingTrees() {
drop_index_
=
random_for_drop_
.
Sample
(
iter_
,
1
);
drop_index_
=
random_for_drop_
.
Sample
(
iter_
,
1
);
}
}
// drop trees
// drop trees
for
(
int
i
:
drop_index_
)
{
for
(
auto
i
:
drop_index_
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
int
curr_tree
=
i
*
num_class_
+
curr_class
;
auto
curr_tree
=
i
*
num_class_
+
curr_class
;
models_
[
curr_tree
]
->
Shrinkage
(
-
1.0
);
models_
[
curr_tree
]
->
Shrinkage
(
-
1.0
);
train_score_updater_
->
AddScore
(
models_
[
curr_tree
],
curr_class
);
train_score_updater_
->
AddScore
(
models_
[
curr_tree
],
curr_class
);
}
}
...
@@ -122,9 +122,9 @@ void DART::DroppingTrees() {
...
@@ -122,9 +122,9 @@ void DART::DroppingTrees() {
void
DART
::
Normalize
()
{
void
DART
::
Normalize
()
{
double
k
=
static_cast
<
double
>
(
drop_index_
.
size
());
double
k
=
static_cast
<
double
>
(
drop_index_
.
size
());
for
(
int
i
:
drop_index_
)
{
for
(
auto
i
:
drop_index_
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
int
curr_tree
=
i
*
num_class_
+
curr_class
;
auto
curr_tree
=
i
*
num_class_
+
curr_class
;
// update validation score
// update validation score
models_
[
curr_tree
]
->
Shrinkage
(
shrinkage_rate_
);
models_
[
curr_tree
]
->
Shrinkage
(
shrinkage_rate_
);
for
(
auto
&
score_updater
:
valid_score_updater_
)
{
for
(
auto
&
score_updater
:
valid_score_updater_
)
{
...
...
src/boosting/gbdt.cpp
View file @
952099d6
...
@@ -385,7 +385,7 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
...
@@ -385,7 +385,7 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
model_output_file_
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
model_output_file_
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
}
}
saved_model_size_
=
Common
::
M
ax
(
saved_model_size_
,
rest
);
saved_model_size_
=
std
::
m
ax
(
saved_model_size_
,
rest
);
model_output_file_
.
flush
();
model_output_file_
.
flush
();
// training finished, can close file
// training finished, can close file
...
...
src/io/bin.cpp
View file @
952099d6
...
@@ -42,29 +42,45 @@ BinMapper::~BinMapper() {
...
@@ -42,29 +42,45 @@ BinMapper::~BinMapper() {
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
)
{
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>*
values
,
size_t
total_sample_cnt
,
int
max_bin
)
{
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
std
::
vector
<
double
>&
ref_values
=
(
*
values
);
size_t
sample_size
=
total_sample_cnt
;
size_t
sample_size
=
total_sample_cnt
;
size_
t
zero_cnt
=
total_sample_cnt
-
ref_values
.
size
();
in
t
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
ref_values
.
size
()
)
;
// find distinct_values first
// find distinct_values first
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
double
>
distinct_values
;
std
::
vector
<
int
>
counts
;
std
::
vector
<
int
>
counts
;
std
::
sort
(
ref_values
.
begin
(),
ref_values
.
end
());
std
::
sort
(
ref_values
.
begin
(),
ref_values
.
end
());
// push 0 first
if
(
zero_cnt
>
0
)
{
// push zero in the front
distinct_values
.
push_back
(
0.0
f
);
if
(
ref_values
.
size
()
==
0
||
(
ref_values
[
0
]
>
0.0
f
&&
zero_cnt
>
0
))
{
counts
.
push_back
(
static_cast
<
int
>
(
zero_cnt
));
distinct_values
.
push_back
(
0
);
counts
.
push_back
(
zero_cnt
);
}
}
if
(
ref_values
.
size
()
>
0
)
{
if
(
ref_values
.
size
()
>
0
)
{
distinct_values
.
push_back
(
ref_values
[
0
]);
distinct_values
.
push_back
(
ref_values
[
0
]);
counts
.
push_back
(
1
);
counts
.
push_back
(
1
);
}
}
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
ref_values
.
size
();
++
i
)
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
ref_values
[
i
]
!=
ref_values
[
i
-
1
])
{
if
(
ref_values
[
i
-
1
]
==
0.0
f
)
{
counts
.
back
()
+=
zero_cnt
;
}
else
if
(
ref_values
[
i
-
1
]
<
0.0
f
&&
ref_values
[
i
]
>
0.0
f
)
{
distinct_values
.
push_back
(
0
);
counts
.
push_back
(
zero_cnt
);
}
distinct_values
.
push_back
(
ref_values
[
i
]);
distinct_values
.
push_back
(
ref_values
[
i
]);
counts
.
push_back
(
1
);
counts
.
push_back
(
1
);
}
else
{
}
else
{
++
counts
.
back
();
++
counts
.
back
();
}
}
}
}
// push zero in the back
if
(
ref_values
.
size
()
>
0
&&
ref_values
.
back
()
<
0.0
f
&&
zero_cnt
>
0
)
{
distinct_values
.
push_back
(
0
);
counts
.
push_back
(
zero_cnt
);
}
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
cnt_in_bin0
=
0
;
int
cnt_in_bin0
=
0
;
if
(
num_values
<=
max_bin
)
{
if
(
num_values
<=
max_bin
)
{
...
@@ -78,53 +94,38 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
...
@@ -78,53 +94,38 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
cnt_in_bin0
=
counts
[
0
];
cnt_in_bin0
=
counts
[
0
];
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
bin_upper_bound_
[
num_values
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
else
{
}
else
{
double
min_lower_bound
=
std
::
numeric_limits
<
double
>::
infinity
();
// mean size for one bin
// mean size for one bin
double
mean_bin_size
=
sample_size
/
static_cast
<
double
>
(
max_bin
);
double
mean_bin_size
=
sample_size
/
static_cast
<
double
>
(
max_bin
);
int
rest_sample_cnt
=
static_cast
<
int
>
(
sample_size
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
int
bin_cnt
=
0
;
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
}
}
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
// sort by count, descent
Common
::
SortForPair
(
counts
,
distinct_values
,
0
,
true
);
int
rest_sample_cnt
=
static_cast
<
int
>
(
sample_size
);
// fetch big slot as unique bin
int
bin_cnt
=
0
;
while
(
counts
[
bin_cnt
]
>
mean_bin_size
)
{
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
upper_bounds
[
bin_cnt
]
=
distinct_values
[
bin_cnt
];
lower_bounds
[
bin_cnt
]
=
distinct_values
[
bin_cnt
];
if
(
lower_bounds
[
bin_cnt
]
<
min_lower_bound
)
{
min_lower_bound
=
lower_bounds
[
bin_cnt
];
cnt_in_bin0
=
counts
[
bin_cnt
];
}
rest_sample_cnt
-=
counts
[
bin_cnt
];
++
bin_cnt
;
}
// process reminder bins
if
(
bin_cnt
<
max_bin
)
{
// sort rest by values
Common
::
SortForPair
<
double
,
int
>
(
distinct_values
,
counts
,
bin_cnt
,
false
);
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
max_bin
-
bin_cnt
);
lower_bounds
[
bin_cnt
]
=
distinct_values
[
bin_cnt
];
int
cur_cnt_inbin
=
0
;
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
bin_cnt
;
i
<
num_values
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
rest_sample_cnt
-=
counts
[
i
];
rest_sample_cnt
-=
counts
[
i
];
cur_cnt_inbin
+=
counts
[
i
];
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
// need a new bin
if
(
cur_cnt_inbin
>=
mean_bin_size
)
{
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
if
(
lower_bounds
[
bin_cnt
]
<
min_lower_bound
)
{
if
(
bin_cnt
==
0
)
{
min_lower_bound
=
lower_bounds
[
bin_cnt
];
cnt_in_bin0
=
cur_cnt_inbin
;
cnt_in_bin0
=
cur_cnt_inbin
;
}
}
++
bin_cnt
;
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
break
;
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
cur_cnt_inbin
=
0
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
max_bin
-
bin_cnt
);
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
max_bin
-
bin_cnt
);
}
}
}
}
cur_cnt_inbin
+=
counts
[
num_values
-
1
];
//
}
++
bin_cnt
;
Common
::
SortForPair
<
double
,
double
>
(
lower_bounds
,
upper_bounds
,
0
,
false
);
// update bin upper bound
// update bin upper bound
bin_upper_bound_
=
new
double
[
bin_cnt
];
bin_upper_bound_
=
new
double
[
bin_cnt
];
num_bin_
=
bin_cnt
;
num_bin_
=
bin_cnt
;
...
...
src/io/dataset_loader.cpp
View file @
952099d6
...
@@ -657,7 +657,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -657,7 +657,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start
[
0
]
=
0
;
start
[
0
]
=
0
;
for
(
int
i
=
0
;
i
<
num_machines
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_machines
-
1
;
++
i
)
{
len
[
i
]
=
Common
::
Min
<
int
>
(
step
,
total_num_feature
-
start
[
i
]);
len
[
i
]
=
std
::
min
(
step
,
total_num_feature
-
start
[
i
]);
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
}
}
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
...
...
src/network/linkers.h
View file @
952099d6
#ifndef LIGHTGBM_NETWORK_LINKERS_H_
#ifndef LIGHTGBM_NETWORK_LINKERS_H_
#define LIGHTGBM_NETWORK_LINKERS_H_
#define LIGHTGBM_NETWORK_LINKERS_H_
#include <LightGBM/meta.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/config.h>
#include <LightGBM/network.h>
#include <LightGBM/network.h>
#include <algorithm>
#include <chrono>
#include <chrono>
#include <ctime>
#include <ctime>
#ifdef USE_SOCKET
#ifdef USE_SOCKET
#include "socket_wrapper.hpp"
#include "socket_wrapper.hpp"
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/common.h>
...
@@ -171,9 +172,9 @@ inline const RecursiveHalvingMap& Linkers::recursive_halving_map() {
...
@@ -171,9 +172,9 @@ inline const RecursiveHalvingMap& Linkers::recursive_halving_map() {
inline
void
Linkers
::
Recv
(
int
rank
,
char
*
data
,
int
len
)
const
{
inline
void
Linkers
::
Recv
(
int
rank
,
char
*
data
,
int
len
)
const
{
int
recv_cnt
=
0
;
int
recv_cnt
=
0
;
while
(
recv_cnt
<
len
)
{
while
(
recv_cnt
<
len
)
{
recv_cnt
+=
linkers_
[
rank
]
->
Recv
(
data
+
recv_cnt
,
recv_cnt
+=
linkers_
[
rank
]
->
Recv
(
data
+
recv_cnt
,
//len - recv_cnt
//len - recv_cnt
Common
::
Min
<
int
>
(
len
-
recv_cnt
,
SocketConfig
::
kMaxReceiveSize
)
std
::
min
(
len
-
recv_cnt
,
SocketConfig
::
kMaxReceiveSize
)
);
);
}
}
}
}
...
...
src/network/network.cpp
View file @
952099d6
...
@@ -54,7 +54,7 @@ void Network::Allreduce(char* input, int input_size, int type_size, char* output
...
@@ -54,7 +54,7 @@ void Network::Allreduce(char* input, int input_size, int type_size, char* output
}
}
block_start_
[
0
]
=
0
;
block_start_
[
0
]
=
0
;
for
(
int
i
=
0
;
i
<
num_machines_
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_machines_
-
1
;
++
i
)
{
block_len_
[
i
]
=
Common
::
Min
<
int
>
(
step
*
type_size
,
input_size
-
block_start_
[
i
]);
block_len_
[
i
]
=
std
::
min
(
step
*
type_size
,
input_size
-
block_start_
[
i
]);
block_start_
[
i
+
1
]
=
block_start_
[
i
]
+
block_len_
[
i
];
block_start_
[
i
+
1
]
=
block_start_
[
i
]
+
block_len_
[
i
];
}
}
block_len_
[
num_machines_
-
1
]
=
input_size
-
block_start_
[
num_machines_
-
1
];
block_len_
[
num_machines_
-
1
]
=
input_size
-
block_start_
[
num_machines_
-
1
];
...
@@ -108,7 +108,7 @@ void Network::Allgather(char* input, int all_size, int* block_start, int* block_
...
@@ -108,7 +108,7 @@ void Network::Allgather(char* input, int all_size, int* block_start, int* block_
int
accumulated_block
=
1
;
int
accumulated_block
=
1
;
for
(
int
i
=
0
;
i
<
bruck_map_
.
k
;
++
i
)
{
for
(
int
i
=
0
;
i
<
bruck_map_
.
k
;
++
i
)
{
// get current local block size
// get current local block size
int
cur_block_size
=
Common
::
Min
<
int
>
(
1
<<
i
,
num_machines_
-
accumulated_block
);
int
cur_block_size
=
std
::
min
(
1
<<
i
,
num_machines_
-
accumulated_block
);
// get out rank
// get out rank
int
out_rank
=
bruck_map_
.
out_ranks
[
i
];
int
out_rank
=
bruck_map_
.
out_ranks
[
i
];
// get send information
// get send information
...
...
src/network/socket_wrapper.hpp
View file @
952099d6
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#ifdef USE_SOCKET
#ifdef USE_SOCKET
#if defined(_WIN32)
#if defined(_WIN32)
#define NOMINMAX
#include <winsock2.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#include <ws2tcpip.h>
#include <iphlpapi.h>
#include <iphlpapi.h>
...
...
src/treelearner/serial_tree_learner.cpp
View file @
952099d6
...
@@ -62,8 +62,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
...
@@ -62,8 +62,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
max_cache_size
=
static_cast
<
int
>
(
histogram_pool_size_
*
1024
*
1024
/
total_histogram_size
);
max_cache_size
=
static_cast
<
int
>
(
histogram_pool_size_
*
1024
*
1024
/
total_histogram_size
);
}
}
// at least need 2 leaves
// at least need 2 leaves
max_cache_size
=
Common
::
M
ax
(
2
,
max_cache_size
);
max_cache_size
=
std
::
m
ax
(
2
,
max_cache_size
);
max_cache_size
=
Common
::
M
in
(
max_cache_size
,
num_leaves_
);
max_cache_size
=
std
::
m
in
(
max_cache_size
,
num_leaves_
);
histogram_pool_
.
ResetSize
(
max_cache_size
,
num_leaves_
);
histogram_pool_
.
ResetSize
(
max_cache_size
,
num_leaves_
);
auto
histogram_create_function
=
[
this
]()
{
auto
histogram_create_function
=
[
this
]()
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment