Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dlib
Commits
9260243c
Commit
9260243c
authored
Apr 30, 2016
by
Davis King
Browse files
Fixed bugs in multi-gpu training code.
parent
26d2d889
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
51 additions
and
57 deletions
+51
-57
dlib/dnn/trainer.h
dlib/dnn/trainer.h
+51
-57
No files found.
dlib/dnn/trainer.h
View file @
9260243c
...
...
@@ -501,28 +501,12 @@ namespace dlib
std
::
vector
<
std
::
future
<
double
>>
losses
(
devices
.
size
());
std
::
vector
<
std
::
future
<
void
>>
update_futs
(
devices
.
size
());
std
::
vector
<
tt
::
multi_device_tensor_averager
>
averagers
(
net_type
::
num_computational_layers
);
if
(
devices
.
size
()
>
1
)
{
// setup the averagers to point to the tensors in the networks.
std
::
vector
<
std
::
vector
<
tensor
*>>
all_tensors
(
devices
.
size
());
for
(
size_t
i
=
0
;
i
<
all_tensors
.
size
();
++
i
)
{
all_tensors
[
i
].
resize
(
net_type
::
num_computational_layers
);
visit_layer_parameter_gradients
(
devices
[
i
]
->
net
,
[
&
](
size_t
j
,
tensor
&
t
){
all_tensors
[
i
][
j
]
=
&
t
;
});
}
// Now set each averager to average the tensors at the same layer in each
// network.
for
(
size_t
i
=
0
;
i
<
net_type
::
num_computational_layers
;
++
i
)
{
std
::
vector
<
tensor
*>
temp
(
all_tensors
.
size
());
for
(
size_t
j
=
0
;
j
<
all_tensors
.
size
();
++
j
)
temp
[
j
]
=
all_tensors
[
j
][
i
];
averagers
[
i
].
set
(
temp
);
}
}
std
::
vector
<
tt
::
multi_device_tensor_averager
>
averagers
;
// An array of all the parameter tensors in the first network. We will
// periodically copy these tensors to all the other devices to make sure the
// different GPUs don't go out of sync.
std
::
vector
<
tensor
*>
reference_params
;
visit_layer_parameters
(
devices
[
0
]
->
net
,
[
&
](
size_t
,
tensor
&
t
)
{
reference_params
.
push_back
(
&
t
);
});
size_t
iteration
=
0
;
...
...
@@ -544,48 +528,41 @@ namespace dlib
// gradient updates between devices. So we do that now.
if
(
devices
.
size
()
>
1
)
{
for
(
auto
&&
d
:
devices
)
cuda
::
device_synchronize
(
d
->
device_id
);
for
(
auto
&&
avg
:
averagers
)
avg
.
average
();
/*
for (auto&& d : devices)
cuda::device_synchronize(d->device_id);
*/
// Evey now and then force all the parameters to be the same just to
// make sure they aren't drifting apart due to any non-deterministic
// behavior on the GPU.
/*
if (iteration%5000 == 1)
// if this is the first iteration then we need to setup the averagers.
// We can't do this outside the loop because the tensors that get
// averaged need to be allocated to their devices before we call set()
// so that the averagers can determine how best to average them.
if
(
averagers
.
size
()
==
0
)
{
for (auto&& p : param_buffer)
p = 0;
// now average all the parameters
for (size_t i = 0; i <
device
s.size(); ++i)
averagers
=
std
::
vector
<
tt
::
multi_device_tensor_averager
>
(
net_type
::
num_computational_layers
);
// setup the averagers to point to the tensors in the networks.
std
::
vector
<
std
::
vector
<
tensor
*>>
all_tensors
(
devices
.
size
());
for
(
size_t
i
=
0
;
i
<
all_tensor
s
.
size
();
++
i
)
{
visit_layer_parameters(devices[i]->net, [¶m_buffer](size_t j, tensor& t) {
if (t.size() != 0)
param_buffer
[j]
+
=
mat(t)
;
all_tensors
[
i
].
resize
(
net_type
::
num_computational_layers
);
visit_layer_parameter_gradients
(
devices
[
i
]
->
net
,
[
&
](
size_t
j
,
tensor
&
t
){
all_tensors
[
i
]
[
j
]
=
&
t
;
});
}
//
and then assign the parameters back to all the networks.
const float scale = 1.0f/devices.size();
for (size_t i = 0; i <
devices.size()
; ++i)
//
Now set each averager to average the tensors at the same layer in each
// network.
for
(
size_t
i
=
0
;
i
<
net_type
::
num_computational_layers
;
++
i
)
{
visit_layer_parameters(devices[i]->net, [scale,¶m_buffer](size_t j, tensor& t) {
if (t.size() != 0)
{
t = param_buffer[j]*scale;
t.async_copy_to_device();
}
});
std
::
vector
<
tensor
*>
temp
(
all_tensors
.
size
());
for
(
size_t
j
=
0
;
j
<
all_tensors
.
size
();
++
j
)
temp
[
j
]
=
all_tensors
[
j
][
i
];
// ignore layers that don't have parameters
if
(
temp
[
0
]
->
size
()
!=
0
)
averagers
[
i
].
set
(
temp
);
}
}
*/
for
(
auto
&&
d
:
devices
)
cuda
::
device_synchronize
(
d
->
device_id
);
for
(
auto
&&
avg
:
averagers
)
avg
.
average
();
}
...
...
@@ -597,6 +574,23 @@ namespace dlib
f
.
wait
();
// Evey now and then force all the parameters to be the same just to make
// sure they aren't drifting apart due to any non-deterministic behavior on
// the GPU. It's also important to do this on the first iteration because
// the different networks may be initialized differently when tensor data
// is first passed through them. So this code block deals with these
// issues.
if
(
devices
.
size
()
>
1
&&
iteration
%
2000
==
1
)
{
for
(
size_t
i
=
1
;
i
<
devices
.
size
();
++
i
)
{
visit_layer_parameters
(
devices
[
i
]
->
net
,
[
&
](
size_t
j
,
tensor
&
t
)
{
memcpy
(
t
,
*
reference_params
[
j
]);
});
}
}
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the step size. Note that we
// have a "budget" that prevents us from calling
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment