Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dlib
Commits
93e786db
Commit
93e786db
authored
May 26, 2016
by
Fm
Browse files
Merge branch 'master' of
https://github.com/davisking/dlib
into dnn_group_layer
parents
59892409
91163863
Changes
23
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1293 additions
and
210 deletions
+1293
-210
dlib/algs.h
dlib/algs.h
+7
-0
dlib/dnn/core.h
dlib/dnn/core.h
+40
-8
dlib/dnn/core_abstract.h
dlib/dnn/core_abstract.h
+27
-1
dlib/dnn/cpu_dlib.cpp
dlib/dnn/cpu_dlib.cpp
+56
-13
dlib/dnn/cpu_dlib.h
dlib/dnn/cpu_dlib.h
+20
-0
dlib/dnn/cuda_dlib.cu
dlib/dnn/cuda_dlib.cu
+42
-4
dlib/dnn/cuda_dlib.h
dlib/dnn/cuda_dlib.h
+14
-0
dlib/dnn/cudnn_dlibapi.cpp
dlib/dnn/cudnn_dlibapi.cpp
+30
-14
dlib/dnn/cudnn_dlibapi.h
dlib/dnn/cudnn_dlibapi.h
+6
-0
dlib/dnn/layers.h
dlib/dnn/layers.h
+204
-41
dlib/dnn/layers_abstract.h
dlib/dnn/layers_abstract.h
+280
-4
dlib/dnn/solvers.h
dlib/dnn/solvers.h
+177
-4
dlib/dnn/solvers_abstract.h
dlib/dnn/solvers_abstract.h
+18
-0
dlib/dnn/tensor_tools.cpp
dlib/dnn/tensor_tools.cpp
+66
-22
dlib/dnn/tensor_tools.h
dlib/dnn/tensor_tools.h
+78
-17
dlib/dnn/trainer.h
dlib/dnn/trainer.h
+15
-6
dlib/optimization/optimization.h
dlib/optimization/optimization.h
+2
-2
dlib/optimization/optimization_abstract.h
dlib/optimization/optimization_abstract.h
+2
-2
dlib/test/dnn.cpp
dlib/test/dnn.cpp
+112
-24
examples/dnn_mnist_advanced_ex.cpp
examples/dnn_mnist_advanced_ex.cpp
+97
-48
No files found.
dlib/algs.h
View file @
93e786db
...
@@ -488,6 +488,13 @@ namespace dlib
...
@@ -488,6 +488,13 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
struct
general_
{};
struct
special_
:
general_
{};
template
<
typename
>
struct
int_
{
typedef
int
type
;
};
// ----------------------------------------------------------------------------------------
/*!A is_same_object
/*!A is_same_object
This is a templated function which checks if both of its arguments are actually
This is a templated function which checks if both of its arguments are actually
...
...
dlib/dnn/core.h
View file @
93e786db
...
@@ -24,6 +24,38 @@
...
@@ -24,6 +24,38 @@
namespace
dlib
namespace
dlib
{
{
// ----------------------------------------------------------------------------------------
namespace
impl
{
template
<
typename
T
,
typename
int_
<
decltype
(
&
T
::
get_learning_rate_multiplier
)>
::
type
=
0
>
double
get_learning_rate_multiplier
(
const
T
&
obj
,
special_
)
{
return
obj
.
get_learning_rate_multiplier
();
}
template
<
typename
T
>
double
get_learning_rate_multiplier
(
const
T
&
obj
,
general_
)
{
return
1
;
}
}
template
<
typename
T
>
double
get_learning_rate_multiplier
(
const
T
&
obj
)
{
return
impl
::
get_learning_rate_multiplier
(
obj
,
special_
());
}
// ----------------------------------------------------------------------------------------
namespace
impl
{
template
<
typename
T
,
typename
int_
<
decltype
(
&
T
::
get_weight_decay_multiplier
)>
::
type
=
0
>
double
get_weight_decay_multiplier
(
const
T
&
obj
,
special_
)
{
return
obj
.
get_weight_decay_multiplier
();
}
template
<
typename
T
>
double
get_weight_decay_multiplier
(
const
T
&
obj
,
general_
)
{
return
1
;
}
}
template
<
typename
T
>
double
get_weight_decay_multiplier
(
const
T
&
obj
)
{
return
impl
::
get_weight_decay_multiplier
(
obj
,
special_
());
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace
impl
namespace
impl
...
@@ -458,7 +490,7 @@ namespace dlib
...
@@ -458,7 +490,7 @@ namespace dlib
sstack
pop
(
size_t
num
=
1
)
sstack
pop
(
size_t
num
=
1
)
{
{
DLIB_CASSERT
(
num
<
size
(),
"You can't pop more things from the stack than it has in it."
);
DLIB_CASSERT
(
num
<
=
size
(),
"You can't pop more things from the stack than it has in it."
);
return
sstack
(
data
+
num
,
mysize
-
num
);
return
sstack
(
data
+
num
,
mysize
-
num
);
}
}
...
@@ -849,8 +881,9 @@ namespace dlib
...
@@ -849,8 +881,9 @@ namespace dlib
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rate
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rate
)
{
{
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
// Don't try to adjust the parameters if this layer doesn't have any.
// Don't try to adjust the parameters if this layer doesn't have any or the
if
(
params_grad
.
size
()
!=
0
)
// learning rate is disabled for this layer.
if
(
params_grad
.
size
()
!=
0
&&
get_learning_rate_multiplier
(
details
)
!=
0
)
{
{
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
(),
step
);
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
(),
step
);
...
@@ -1200,8 +1233,9 @@ namespace dlib
...
@@ -1200,8 +1233,9 @@ namespace dlib
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rate
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rate
)
{
{
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
// Don't try to adjust the parameters if this layer doesn't have any.
// Don't try to adjust the parameters if this layer doesn't have any or the
if
(
params_grad
.
size
()
!=
0
)
// learning rate is disabled for this layer.
if
(
params_grad
.
size
()
!=
0
&&
get_learning_rate_multiplier
(
details
)
!=
0
)
{
{
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
(),
step
);
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
(),
step
);
...
@@ -1817,9 +1851,7 @@ namespace dlib
...
@@ -1817,9 +1851,7 @@ namespace dlib
public:
public:
typedef
INPUT_LAYER
subnet_type
;
typedef
INPUT_LAYER
subnet_type
;
typedef
typename
subnet_type
::
input_type
input_type
;
typedef
typename
subnet_type
::
input_type
input_type
;
// This layer counts as a computational layer because it copies and stores the
const
static
size_t
num_computational_layers
=
0
;
// inputs.
const
static
size_t
num_computational_layers
=
1
;
const
static
size_t
num_layers
=
2
;
const
static
size_t
num_layers
=
2
;
const
static
unsigned
int
sample_expansion_factor
=
subnet_type
::
sample_expansion_factor
;
const
static
unsigned
int
sample_expansion_factor
=
subnet_type
::
sample_expansion_factor
;
static_assert
(
sample_expansion_factor
>=
1
,
static_assert
(
sample_expansion_factor
>=
1
,
...
...
dlib/dnn/core_abstract.h
View file @
93e786db
...
@@ -67,6 +67,32 @@ namespace dlib
...
@@ -67,6 +67,32 @@ namespace dlib
(except computes it using a numerically accurate method)
(except computes it using a numerically accurate method)
!*/
!*/
// ----------------------------------------------------------------------------------------
template
<
typename
T
>
double
get_learning_rate_multiplier
(
const
T
&
obj
);
/*!
ensures
- if (obj has a get_learning_rate_multiplier() member function) then
- returns obj.get_learning_rate_multiplier()
- else
- returns 1
!*/
template
<
typename
T
>
double
get_weight_decay_multiplier
(
const
T
&
obj
);
/*!
ensures
- if (obj has a get_weight_decay_multiplier() member function) then
- returns obj.get_weight_decay_multiplier()
- else
- returns 1
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
bool
dnn_prefer_fastest_algorithms
(
bool
dnn_prefer_fastest_algorithms
(
...
@@ -152,7 +178,7 @@ namespace dlib
...
@@ -152,7 +178,7 @@ namespace dlib
);
);
/*!
/*!
requires
requires
- num < size()
- num <
=
size()
ensures
ensures
- returns a reference to the sub-stack S such that:
- returns a reference to the sub-stack S such that:
- S.size() == size()-num.
- S.size() == size()-num.
...
...
dlib/dnn/cpu_dlib.cpp
View file @
93e786db
...
@@ -385,6 +385,30 @@ namespace dlib
...
@@ -385,6 +385,30 @@ namespace dlib
d
[
i
]
=
A
*
s1
[
i
]
+
B
*
s2
[
i
]
+
C
*
s3
[
i
]
+
D
;
d
[
i
]
=
A
*
s1
[
i
]
+
B
*
s2
[
i
]
+
C
*
s3
[
i
]
+
D
;
}
}
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
)
{
DLIB_CASSERT
(
dest
.
size
()
==
src1
.
size
(),
""
);
DLIB_CASSERT
(
dest
.
size
()
==
src2
.
size
(),
""
);
DLIB_CASSERT
(
dest
.
size
()
==
src3
.
size
(),
""
);
DLIB_CASSERT
(
begin
<=
end
&&
end
<=
dest
.
size
(),
""
);
const
auto
d
=
dest
.
host
();
const
auto
s1
=
src1
.
host
();
const
auto
s2
=
src2
.
host
();
const
auto
s3
=
src3
.
host
();
for
(
size_t
i
=
begin
;
i
<
end
;
++
i
)
d
[
i
]
=
A
*
s1
[
i
]
+
B
*
s2
[
i
]
+
C
*
s3
[
i
];
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
affine_transform
(
void
affine_transform
(
...
@@ -464,6 +488,8 @@ namespace dlib
...
@@ -464,6 +488,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
@@ -480,6 +506,7 @@ namespace dlib
...
@@ -480,6 +506,7 @@ namespace dlib
s
.
size
()
==
v
.
size
()
&&
s
.
size
()
==
v
.
size
()
&&
s
.
size
()
==
params
.
size
()
&&
s
.
size
()
==
params
.
size
()
&&
s
.
size
()
==
params_grad
.
size
(),
""
);
s
.
size
()
==
params_grad
.
size
(),
""
);
DLIB_CASSERT
(
begin
<=
end
&&
end
<=
params
.
size
(),
""
);
const
float
eps
=
1e-8
;
const
float
eps
=
1e-8
;
const
float
alpha
=
learning_rate
*
std
::
sqrt
(
1
-
std
::
pow
(
momentum2
,
t
))
/
(
1
-
std
::
pow
(
momentum1
,
t
));
const
float
alpha
=
learning_rate
*
std
::
sqrt
(
1
-
std
::
pow
(
momentum2
,
t
))
/
(
1
-
std
::
pow
(
momentum1
,
t
));
...
@@ -492,7 +519,7 @@ namespace dlib
...
@@ -492,7 +519,7 @@ namespace dlib
auto
ps
=
s
.
host_write_only
();
auto
ps
=
s
.
host_write_only
();
auto
pparams
=
params
.
host
();
auto
pparams
=
params
.
host
();
auto
ppgrad
=
params_grad
.
host
();
auto
ppgrad
=
params_grad
.
host
();
for
(
size_t
i
=
0
;
i
<
params
.
size
()
;
++
i
)
for
(
size_t
i
=
begin
;
i
<
end
;
++
i
)
{
{
float
g
=
weight_decay
*
pparams
[
i
]
+
ppgrad
[
i
];
float
g
=
weight_decay
*
pparams
[
i
]
+
ppgrad
[
i
];
pm
[
i
]
=
momentum1
*
pm
[
i
]
+
(
1
-
momentum1
)
*
g
;
pm
[
i
]
=
momentum1
*
pm
[
i
]
+
(
1
-
momentum1
)
*
g
;
...
@@ -504,6 +531,7 @@ namespace dlib
...
@@ -504,6 +531,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -519,7 +547,8 @@ namespace dlib
...
@@ -519,7 +547,8 @@ namespace dlib
gamma
.
k
()
==
src
.
k
()
&&
gamma
.
k
()
==
src
.
k
()
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_variances
),
have_same_dimensions
(
gamma
,
running_variances
)
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -538,7 +567,8 @@ namespace dlib
...
@@ -538,7 +567,8 @@ namespace dlib
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
dest
.
copy_size
(
src
);
dest
.
copy_size
(
src
);
...
@@ -554,7 +584,7 @@ namespace dlib
...
@@ -554,7 +584,7 @@ namespace dlib
{
{
for
(
long
k
=
0
;
k
<
num
;
++
k
)
for
(
long
k
=
0
;
k
<
num
;
++
k
)
{
{
*
d
=
g
[
k
]
*
(
*
s
-
m
[
k
])
/
std
::
sqrt
(
v
[
k
]
+
dlib
::
tt
::
BATCH_NORM_EPS
)
+
b
[
k
];
*
d
=
g
[
k
]
*
(
*
s
-
m
[
k
])
/
std
::
sqrt
(
v
[
k
]
+
eps
)
+
b
[
k
];
++
d
;
++
d
;
++
s
;
++
s
;
}
}
...
@@ -562,6 +592,7 @@ namespace dlib
...
@@ -562,6 +592,7 @@ namespace dlib
}
}
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -582,7 +613,8 @@ namespace dlib
...
@@ -582,7 +613,8 @@ namespace dlib
beta
.
num_samples
()
==
1
&&
beta
.
num_samples
()
==
1
&&
gamma
.
nr
()
==
beta
.
nr
()
&&
beta
.
nr
()
==
src
.
nr
()
&&
gamma
.
nr
()
==
beta
.
nr
()
&&
beta
.
nr
()
==
src
.
nr
()
&&
gamma
.
nc
()
==
beta
.
nc
()
&&
beta
.
nc
()
==
src
.
nc
()
&&
gamma
.
nc
()
==
beta
.
nc
()
&&
beta
.
nc
()
==
src
.
nc
()
&&
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
(),
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
()
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -593,7 +625,8 @@ namespace dlib
...
@@ -593,7 +625,8 @@ namespace dlib
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
dest
.
copy_size
(
src
);
dest
.
copy_size
(
src
);
...
@@ -635,7 +668,7 @@ namespace dlib
...
@@ -635,7 +668,7 @@ namespace dlib
else
else
rvar
[
i
]
=
(
1
-
averaging_factor
)
*
rvar
[
i
]
+
scale
*
averaging_factor
*
actual_var
;
rvar
[
i
]
=
(
1
-
averaging_factor
)
*
rvar
[
i
]
+
scale
*
averaging_factor
*
actual_var
;
p_invstds
[
i
]
=
1.0
f
/
std
::
sqrt
(
actual_var
+
dlib
::
tt
::
BATCH_NORM_EPS
);
p_invstds
[
i
]
=
1.0
f
/
std
::
sqrt
(
actual_var
+
eps
);
}
}
p_src
=
src
.
host
();
p_src
=
src
.
host
();
...
@@ -662,6 +695,7 @@ namespace dlib
...
@@ -662,6 +695,7 @@ namespace dlib
}
}
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -682,6 +716,7 @@ namespace dlib
...
@@ -682,6 +716,7 @@ namespace dlib
DLIB_CASSERT
(
num
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
num
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
eps
>
0
,
""
);
beta_grad
=
0
;
beta_grad
=
0
;
gamma_grad
=
0
;
gamma_grad
=
0
;
...
@@ -757,6 +792,7 @@ namespace dlib
...
@@ -757,6 +792,7 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -772,7 +808,8 @@ namespace dlib
...
@@ -772,7 +808,8 @@ namespace dlib
gamma
.
k
()
==
src
.
k
()
&&
gamma
.
k
()
==
src
.
k
()
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_variances
),
have_same_dimensions
(
gamma
,
running_variances
)
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -791,7 +828,8 @@ namespace dlib
...
@@ -791,7 +828,8 @@ namespace dlib
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
dest
.
copy_size
(
src
);
dest
.
copy_size
(
src
);
...
@@ -807,7 +845,7 @@ namespace dlib
...
@@ -807,7 +845,7 @@ namespace dlib
{
{
for
(
long
k
=
0
;
k
<
src
.
k
();
++
k
)
for
(
long
k
=
0
;
k
<
src
.
k
();
++
k
)
{
{
const
float
invstd
=
1.0
f
/
std
::
sqrt
(
v
[
k
]
+
dlib
::
tt
::
BATCH_NORM_EPS
);
const
float
invstd
=
1.0
f
/
std
::
sqrt
(
v
[
k
]
+
eps
);
for
(
long
j
=
0
;
j
<
num
;
++
j
)
for
(
long
j
=
0
;
j
<
num
;
++
j
)
{
{
*
d
=
g
[
k
]
*
(
*
s
-
m
[
k
])
*
invstd
+
b
[
k
];
*
d
=
g
[
k
]
*
(
*
s
-
m
[
k
])
*
invstd
+
b
[
k
];
...
@@ -819,6 +857,7 @@ namespace dlib
...
@@ -819,6 +857,7 @@ namespace dlib
}
}
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -841,7 +880,8 @@ namespace dlib
...
@@ -841,7 +880,8 @@ namespace dlib
beta
.
nr
()
==
1
&&
beta
.
nr
()
==
1
&&
gamma
.
nc
()
==
1
&&
gamma
.
nc
()
==
1
&&
beta
.
nc
()
==
1
&&
beta
.
nc
()
==
1
&&
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
(),
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
()
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -852,7 +892,8 @@ namespace dlib
...
@@ -852,7 +892,8 @@ namespace dlib
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
dest
.
copy_size
(
src
);
dest
.
copy_size
(
src
);
...
@@ -900,7 +941,7 @@ namespace dlib
...
@@ -900,7 +941,7 @@ namespace dlib
else
else
rvar
[
k
]
=
(
1
-
averaging_factor
)
*
rvar
[
k
]
+
scale
*
averaging_factor
*
actual_var
;
rvar
[
k
]
=
(
1
-
averaging_factor
)
*
rvar
[
k
]
+
scale
*
averaging_factor
*
actual_var
;
p_invstds
[
k
]
=
1.0
f
/
std
::
sqrt
(
actual_var
+
dlib
::
tt
::
BATCH_NORM_EPS
);
p_invstds
[
k
]
=
1.0
f
/
std
::
sqrt
(
actual_var
+
eps
);
}
}
p_src
=
src
.
host
();
p_src
=
src
.
host
();
...
@@ -928,6 +969,7 @@ namespace dlib
...
@@ -928,6 +969,7 @@ namespace dlib
}
}
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -948,6 +990,7 @@ namespace dlib
...
@@ -948,6 +990,7 @@ namespace dlib
DLIB_CASSERT
(
src
.
k
()
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
src
.
k
()
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
eps
>
0
,
""
);
beta_grad
=
0
;
beta_grad
=
0
;
gamma_grad
=
0
;
gamma_grad
=
0
;
...
...
dlib/dnn/cpu_dlib.h
View file @
93e786db
...
@@ -81,6 +81,18 @@ namespace dlib
...
@@ -81,6 +81,18 @@ namespace dlib
const
float
D
const
float
D
);
);
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
);
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
affine_transform
(
void
affine_transform
(
...
@@ -102,6 +114,8 @@ namespace dlib
...
@@ -102,6 +114,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
@@ -117,6 +131,7 @@ namespace dlib
...
@@ -117,6 +131,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -126,6 +141,7 @@ namespace dlib
...
@@ -126,6 +141,7 @@ namespace dlib
);
);
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -138,6 +154,7 @@ namespace dlib
...
@@ -138,6 +154,7 @@ namespace dlib
);
);
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -149,6 +166,7 @@ namespace dlib
...
@@ -149,6 +166,7 @@ namespace dlib
);
);
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -158,6 +176,7 @@ namespace dlib
...
@@ -158,6 +176,7 @@ namespace dlib
);
);
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -170,6 +189,7 @@ namespace dlib
...
@@ -170,6 +189,7 @@ namespace dlib
);
);
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
...
dlib/dnn/cuda_dlib.cu
View file @
93e786db
...
@@ -504,6 +504,40 @@ namespace dlib
...
@@ -504,6 +504,40 @@ namespace dlib
src2
.
device
(),
src3
.
device
(),
dest
.
size
(),
A
,
B
,
C
,
D
);
src2
.
device
(),
src3
.
device
(),
dest
.
size
(),
A
,
B
,
C
,
D
);
}
}
// ----------------------------------------------------------------------------------------
__global__
void
_cuda_affine_transform_range
(
float
*
d
,
const
float
*
s1
,
const
float
*
s2
,
const
float
*
s3
,
size_t
begin
,
size_t
end
,
float
A
,
float
B
,
float
C
)
{
for
(
auto
i
:
grid_stride_range
(
begin
,
end
))
{
d
[
i
]
=
A
*
s1
[
i
]
+
B
*
s2
[
i
]
+
C
*
s3
[
i
];
}
}
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
)
{
DLIB_CASSERT
(
dest
.
size
()
==
src1
.
size
(),
""
);
DLIB_CASSERT
(
dest
.
size
()
==
src2
.
size
(),
""
);
DLIB_CASSERT
(
dest
.
size
()
==
src3
.
size
(),
""
);
DLIB_CASSERT
(
begin
<=
end
&&
end
<=
dest
.
size
(),
""
);
launch_kernel
(
_cuda_affine_transform_range
,
max_jobs
(
end
-
begin
),
dest
.
device
(),
src1
.
device
(),
src2
.
device
(),
src3
.
device
(),
begin
,
end
,
A
,
B
,
C
);
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
__global__
void
_cuda_affine_transform2
(
float
*
d
,
const
float
*
s
,
size_t
n
,
const
float
*
A
,
const
float
*
B
)
__global__
void
_cuda_affine_transform2
(
float
*
d
,
const
float
*
s
,
size_t
n
,
const
float
*
A
,
const
float
*
B
)
...
@@ -549,7 +583,8 @@ namespace dlib
...
@@ -549,7 +583,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
__global__
void
_cuda_compute_adam_update
(
__global__
void
_cuda_compute_adam_update
(
size_t
n
,
size_t
begin
,
size_t
end
,
float
*
s
,
float
*
s
,
float
*
m
,
float
*
m
,
float
*
v
,
float
*
v
,
...
@@ -566,7 +601,7 @@ namespace dlib
...
@@ -566,7 +601,7 @@ namespace dlib
// m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
// m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
// v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
// v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
// s = -alpha*m/(sqrt(v) + eps);
// s = -alpha*m/(sqrt(v) + eps);
for
(
auto
i
:
grid_stride_range
(
0
,
n
))
for
(
auto
i
:
grid_stride_range
(
begin
,
end
))
{
{
float
g
=
(
weight_decay
*
params
[
i
]
+
params_grad
[
i
]);
float
g
=
(
weight_decay
*
params
[
i
]
+
params_grad
[
i
]);
m
[
i
]
=
momentum1
*
m
[
i
]
+
(
1
-
momentum1
)
*
g
;
m
[
i
]
=
momentum1
*
m
[
i
]
+
(
1
-
momentum1
)
*
g
;
...
@@ -576,6 +611,8 @@ namespace dlib
...
@@ -576,6 +611,8 @@ namespace dlib
}
}
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
@@ -592,10 +629,11 @@ namespace dlib
...
@@ -592,10 +629,11 @@ namespace dlib
s
.
size
()
==
v
.
size
()
&&
s
.
size
()
==
v
.
size
()
&&
s
.
size
()
==
params
.
size
()
&&
s
.
size
()
==
params
.
size
()
&&
s
.
size
()
==
params_grad
.
size
(),
""
);
s
.
size
()
==
params_grad
.
size
(),
""
);
DLIB_CASSERT
(
begin
<=
end
&&
end
<=
params
.
size
(),
""
);
const
float
alpha
=
learning_rate
*
std
::
sqrt
(
1
-
std
::
pow
(
momentum2
,
t
))
/
(
1
-
std
::
pow
(
momentum1
,
t
));
const
float
alpha
=
learning_rate
*
std
::
sqrt
(
1
-
std
::
pow
(
momentum2
,
t
))
/
(
1
-
std
::
pow
(
momentum1
,
t
));
launch_kernel
(
_cuda_compute_adam_update
,
max_jobs
(
s
.
size
()
),
launch_kernel
(
_cuda_compute_adam_update
,
max_jobs
(
end
-
begin
),
s
.
size
()
,
s
.
device
(),
m
.
device
(),
v
.
device
(),
alpha
,
weight_decay
,
begin
,
end
,
s
.
device
(),
m
.
device
(),
v
.
device
(),
alpha
,
weight_decay
,
momentum1
,
momentum2
,
params
.
device
(),
params_grad
.
device
());
momentum1
,
momentum2
,
params
.
device
(),
params_grad
.
device
());
}
}
...
...
dlib/dnn/cuda_dlib.h
View file @
93e786db
...
@@ -164,6 +164,18 @@ namespace dlib
...
@@ -164,6 +164,18 @@ namespace dlib
const
float
D
const
float
D
);
);
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
// called by cuda::add() so we don't need a tt:: version of add_scaled().
void
add_scaled
(
void
add_scaled
(
...
@@ -193,6 +205,8 @@ namespace dlib
...
@@ -193,6 +205,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
...
dlib/dnn/cudnn_dlibapi.cpp
View file @
93e786db
...
@@ -338,6 +338,7 @@ namespace dlib
...
@@ -338,6 +338,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -353,7 +354,8 @@ namespace dlib
...
@@ -353,7 +354,8 @@ namespace dlib
gamma
.
k
()
==
src
.
k
()
&&
gamma
.
k
()
==
src
.
k
()
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_variances
),
have_same_dimensions
(
gamma
,
running_variances
)
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -372,7 +374,8 @@ namespace dlib
...
@@ -372,7 +374,8 @@ namespace dlib
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
const
float
out_scale
=
0
;
const
float
out_scale
=
0
;
...
@@ -393,10 +396,11 @@ namespace dlib
...
@@ -393,10 +396,11 @@ namespace dlib
beta
.
device
(),
beta
.
device
(),
running_means
.
device
(),
running_means
.
device
(),
running_variances
.
device
(),
running_variances
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
));
eps
));
}
}
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -417,7 +421,8 @@ namespace dlib
...
@@ -417,7 +421,8 @@ namespace dlib
beta
.
num_samples
()
==
1
&&
beta
.
num_samples
()
==
1
&&
gamma
.
nr
()
==
beta
.
nr
()
&&
beta
.
nr
()
==
src
.
nr
()
&&
gamma
.
nr
()
==
beta
.
nr
()
&&
beta
.
nr
()
==
src
.
nr
()
&&
gamma
.
nc
()
==
beta
.
nc
()
&&
beta
.
nc
()
==
src
.
nc
()
&&
gamma
.
nc
()
==
beta
.
nc
()
&&
beta
.
nc
()
==
src
.
nc
()
&&
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
(),
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
()
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -428,7 +433,8 @@ namespace dlib
...
@@ -428,7 +433,8 @@ namespace dlib
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
...
@@ -455,12 +461,13 @@ namespace dlib
...
@@ -455,12 +461,13 @@ namespace dlib
averaging_factor
,
averaging_factor
,
running_means
.
device
(),
running_means
.
device
(),
running_variances
.
device
(),
running_variances
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
,
eps
,
means
.
device
(),
means
.
device
(),
invstds
.
device
()));
invstds
.
device
()));
}
}
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -480,6 +487,7 @@ namespace dlib
...
@@ -480,6 +487,7 @@ namespace dlib
DLIB_CASSERT
(
num
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
num
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
eps
>
0
,
""
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
const
float
out_scale
=
1
;
const
float
out_scale
=
1
;
...
@@ -503,7 +511,7 @@ namespace dlib
...
@@ -503,7 +511,7 @@ namespace dlib
gamma
.
device
(),
gamma
.
device
(),
gamma_grad
.
device
(),
gamma_grad
.
device
(),
beta_grad
.
device
(),
beta_grad
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
,
eps
,
means
.
device
(),
means
.
device
(),
invstds
.
device
()));
invstds
.
device
()));
}
}
...
@@ -511,6 +519,7 @@ namespace dlib
...
@@ -511,6 +519,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -526,7 +535,8 @@ namespace dlib
...
@@ -526,7 +535,8 @@ namespace dlib
gamma
.
k
()
==
src
.
k
()
&&
gamma
.
k
()
==
src
.
k
()
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
beta
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_means
)
&&
have_same_dimensions
(
gamma
,
running_variances
),
have_same_dimensions
(
gamma
,
running_variances
)
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -545,7 +555,8 @@ namespace dlib
...
@@ -545,7 +555,8 @@ namespace dlib
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
running_variances.nc(): "
<<
running_variances
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
const
float
out_scale
=
0
;
const
float
out_scale
=
0
;
...
@@ -566,10 +577,11 @@ namespace dlib
...
@@ -566,10 +577,11 @@ namespace dlib
beta
.
device
(),
beta
.
device
(),
running_means
.
device
(),
running_means
.
device
(),
running_variances
.
device
(),
running_variances
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
));
eps
));
}
}
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -592,7 +604,8 @@ namespace dlib
...
@@ -592,7 +604,8 @@ namespace dlib
beta
.
nr
()
==
1
&&
beta
.
nr
()
==
1
&&
gamma
.
nc
()
==
1
&&
gamma
.
nc
()
==
1
&&
beta
.
nc
()
==
1
&&
beta
.
nc
()
==
1
&&
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
(),
gamma
.
k
()
==
beta
.
k
()
&&
beta
.
k
()
==
src
.
k
()
&&
eps
>
0
,
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.num_samples(): "
<<
gamma
.
num_samples
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.k(): "
<<
gamma
.
k
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
"
\n
gamma.nr(): "
<<
gamma
.
nr
()
<<
...
@@ -603,7 +616,8 @@ namespace dlib
...
@@ -603,7 +616,8 @@ namespace dlib
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
beta.nc(): "
<<
beta
.
nc
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.k(): "
<<
src
.
k
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nr(): "
<<
src
.
nr
()
<<
"
\n
src.nc(): "
<<
src
.
nc
()
"
\n
src.nc(): "
<<
src
.
nc
()
<<
"
\n
eps: "
<<
eps
);
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
const
float
out_scale
=
0
;
const
float
out_scale
=
0
;
...
@@ -629,12 +643,13 @@ namespace dlib
...
@@ -629,12 +643,13 @@ namespace dlib
averaging_factor
,
averaging_factor
,
running_means
.
device
(),
running_means
.
device
(),
running_variances
.
device
(),
running_variances
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
,
eps
,
means
.
device
(),
means
.
device
(),
invstds
.
device
()));
invstds
.
device
()));
}
}
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -653,6 +668,7 @@ namespace dlib
...
@@ -653,6 +668,7 @@ namespace dlib
DLIB_CASSERT
(
src
.
k
()
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
src
.
k
()
==
beta_grad
.
size
(),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
have_same_dimensions
(
gradient_input
,
src_grad
),
""
);
DLIB_CASSERT
(
eps
>
0
,
""
);
const
float
in_scale
=
1
;
const
float
in_scale
=
1
;
const
float
out_scale
=
1
;
const
float
out_scale
=
1
;
...
@@ -676,7 +692,7 @@ namespace dlib
...
@@ -676,7 +692,7 @@ namespace dlib
gamma
.
device
(),
gamma
.
device
(),
gamma_grad
.
device
(),
gamma_grad
.
device
(),
beta_grad
.
device
(),
beta_grad
.
device
(),
dlib
::
tt
::
BATCH_NORM_EPS
,
eps
,
means
.
device
(),
means
.
device
(),
invstds
.
device
()));
invstds
.
device
()));
}
}
...
...
dlib/dnn/cudnn_dlibapi.h
View file @
93e786db
...
@@ -135,6 +135,7 @@ namespace dlib
...
@@ -135,6 +135,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -144,6 +145,7 @@ namespace dlib
...
@@ -144,6 +145,7 @@ namespace dlib
);
);
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -156,6 +158,7 @@ namespace dlib
...
@@ -156,6 +158,7 @@ namespace dlib
);
);
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -169,6 +172,7 @@ namespace dlib
...
@@ -169,6 +172,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -178,6 +182,7 @@ namespace dlib
...
@@ -178,6 +182,7 @@ namespace dlib
);
);
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -190,6 +195,7 @@ namespace dlib
...
@@ -190,6 +195,7 @@ namespace dlib
);
);
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
...
dlib/dnn/layers.h
View file @
93e786db
...
@@ -42,6 +42,10 @@ namespace dlib
...
@@ -42,6 +42,10 @@ namespace dlib
con_
(
con_
(
)
:
)
:
learning_rate_multiplier
(
1
),
weight_decay_multiplier
(
1
),
bias_learning_rate_multiplier
(
1
),
bias_weight_decay_multiplier
(
0
),
padding_y_
(
_padding_y
),
padding_y_
(
_padding_y
),
padding_x_
(
_padding_x
)
padding_x_
(
_padding_x
)
{}
{}
...
@@ -54,12 +58,27 @@ namespace dlib
...
@@ -54,12 +58,27 @@ namespace dlib
long
padding_y
()
const
{
return
padding_y_
;
}
long
padding_y
()
const
{
return
padding_y_
;
}
long
padding_x
()
const
{
return
padding_x_
;
}
long
padding_x
()
const
{
return
padding_x_
;
}
double
get_learning_rate_multiplier
()
const
{
return
learning_rate_multiplier
;
}
double
get_weight_decay_multiplier
()
const
{
return
weight_decay_multiplier
;
}
void
set_learning_rate_multiplier
(
double
val
)
{
learning_rate_multiplier
=
val
;
}
void
set_weight_decay_multiplier
(
double
val
)
{
weight_decay_multiplier
=
val
;
}
double
get_bias_learning_rate_multiplier
()
const
{
return
bias_learning_rate_multiplier
;
}
double
get_bias_weight_decay_multiplier
()
const
{
return
bias_weight_decay_multiplier
;
}
void
set_bias_learning_rate_multiplier
(
double
val
)
{
bias_learning_rate_multiplier
=
val
;
}
void
set_bias_weight_decay_multiplier
(
double
val
)
{
bias_weight_decay_multiplier
=
val
;
}
con_
(
con_
(
const
con_
&
item
const
con_
&
item
)
:
)
:
params
(
item
.
params
),
params
(
item
.
params
),
filters
(
item
.
filters
),
filters
(
item
.
filters
),
biases
(
item
.
biases
),
biases
(
item
.
biases
),
learning_rate_multiplier
(
item
.
learning_rate_multiplier
),
weight_decay_multiplier
(
item
.
weight_decay_multiplier
),
bias_learning_rate_multiplier
(
item
.
bias_learning_rate_multiplier
),
bias_weight_decay_multiplier
(
item
.
bias_weight_decay_multiplier
),
padding_y_
(
item
.
padding_y_
),
padding_y_
(
item
.
padding_y_
),
padding_x_
(
item
.
padding_x_
)
padding_x_
(
item
.
padding_x_
)
{
{
...
@@ -81,6 +100,10 @@ namespace dlib
...
@@ -81,6 +100,10 @@ namespace dlib
biases
=
item
.
biases
;
biases
=
item
.
biases
;
padding_y_
=
item
.
padding_y_
;
padding_y_
=
item
.
padding_y_
;
padding_x_
=
item
.
padding_x_
;
padding_x_
=
item
.
padding_x_
;
learning_rate_multiplier
=
item
.
learning_rate_multiplier
;
weight_decay_multiplier
=
item
.
weight_decay_multiplier
;
bias_learning_rate_multiplier
=
item
.
bias_learning_rate_multiplier
;
bias_weight_decay_multiplier
=
item
.
bias_weight_decay_multiplier
;
return
*
this
;
return
*
this
;
}
}
...
@@ -121,18 +144,22 @@ namespace dlib
...
@@ -121,18 +144,22 @@ namespace dlib
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
)
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
)
{
{
conv
.
get_gradient_for_data
(
gradient_input
,
filters
(
params
,
0
),
sub
.
get_gradient_input
());
conv
.
get_gradient_for_data
(
gradient_input
,
filters
(
params
,
0
),
sub
.
get_gradient_input
());
// no point computing the parameter gradients if they won't be used.
if
(
learning_rate_multiplier
!=
0
)
{
auto
filt
=
filters
(
params_grad
,
0
);
auto
filt
=
filters
(
params_grad
,
0
);
conv
.
get_gradient_for_filters
(
gradient_input
,
sub
.
get_output
(),
filt
);
conv
.
get_gradient_for_filters
(
gradient_input
,
sub
.
get_output
(),
filt
);
auto
b
=
biases
(
params_grad
,
filters
.
size
());
auto
b
=
biases
(
params_grad
,
filters
.
size
());
tt
::
assign_conv_bias_gradient
(
b
,
gradient_input
);
tt
::
assign_conv_bias_gradient
(
b
,
gradient_input
);
}
}
}
const
tensor
&
get_layer_params
()
const
{
return
params
;
}
const
tensor
&
get_layer_params
()
const
{
return
params
;
}
tensor
&
get_layer_params
()
{
return
params
;
}
tensor
&
get_layer_params
()
{
return
params
;
}
friend
void
serialize
(
const
con_
&
item
,
std
::
ostream
&
out
)
friend
void
serialize
(
const
con_
&
item
,
std
::
ostream
&
out
)
{
{
serialize
(
"con_
2
"
,
out
);
serialize
(
"con_
3
"
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
_num_filters
,
out
);
serialize
(
_num_filters
,
out
);
serialize
(
_nr
,
out
);
serialize
(
_nr
,
out
);
...
@@ -143,6 +170,10 @@ namespace dlib
...
@@ -143,6 +170,10 @@ namespace dlib
serialize
(
item
.
padding_x_
,
out
);
serialize
(
item
.
padding_x_
,
out
);
serialize
(
item
.
filters
,
out
);
serialize
(
item
.
filters
,
out
);
serialize
(
item
.
biases
,
out
);
serialize
(
item
.
biases
,
out
);
serialize
(
item
.
learning_rate_multiplier
,
out
);
serialize
(
item
.
weight_decay_multiplier
,
out
);
serialize
(
item
.
bias_learning_rate_multiplier
,
out
);
serialize
(
item
.
bias_weight_decay_multiplier
,
out
);
}
}
friend
void
deserialize
(
con_
&
item
,
std
::
istream
&
in
)
friend
void
deserialize
(
con_
&
item
,
std
::
istream
&
in
)
...
@@ -167,7 +198,7 @@ namespace dlib
...
@@ -167,7 +198,7 @@ namespace dlib
item
.
padding_y_
=
nr
/
2
;
item
.
padding_y_
=
nr
/
2
;
item
.
padding_x_
=
nc
/
2
;
item
.
padding_x_
=
nc
/
2
;
}
}
else
if
(
version
==
"con_2"
)
else
if
(
version
==
"con_2"
||
version
==
"con_3"
)
{
{
deserialize
(
item
.
params
,
in
);
deserialize
(
item
.
params
,
in
);
deserialize
(
num_filters
,
in
);
deserialize
(
num_filters
,
in
);
...
@@ -180,6 +211,23 @@ namespace dlib
...
@@ -180,6 +211,23 @@ namespace dlib
deserialize
(
item
.
filters
,
in
);
deserialize
(
item
.
filters
,
in
);
deserialize
(
item
.
biases
,
in
);
deserialize
(
item
.
biases
,
in
);
if
(
version
==
"con_3"
)
{
deserialize
(
item
.
learning_rate_multiplier
,
in
);
deserialize
(
item
.
weight_decay_multiplier
,
in
);
deserialize
(
item
.
bias_learning_rate_multiplier
,
in
);
deserialize
(
item
.
bias_weight_decay_multiplier
,
in
);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item
.
learning_rate_multiplier
=
1
;
item
.
weight_decay_multiplier
=
1
;
item
.
bias_learning_rate_multiplier
=
1
;
item
.
bias_weight_decay_multiplier
=
1
;
}
if
(
item
.
padding_y_
!=
_padding_y
)
throw
serialization_error
(
"Wrong padding_y found while deserializing dlib::con_"
);
if
(
item
.
padding_y_
!=
_padding_y
)
throw
serialization_error
(
"Wrong padding_y found while deserializing dlib::con_"
);
if
(
item
.
padding_x_
!=
_padding_x
)
throw
serialization_error
(
"Wrong padding_x found while deserializing dlib::con_"
);
if
(
item
.
padding_x_
!=
_padding_x
)
throw
serialization_error
(
"Wrong padding_x found while deserializing dlib::con_"
);
}
}
...
@@ -207,6 +255,10 @@ namespace dlib
...
@@ -207,6 +255,10 @@ namespace dlib
<<
", padding_y="
<<
item
.
padding_y_
<<
", padding_y="
<<
item
.
padding_y_
<<
", padding_x="
<<
item
.
padding_x_
<<
", padding_x="
<<
item
.
padding_x_
<<
")"
;
<<
")"
;
out
<<
" learning_rate_mult="
<<
item
.
learning_rate_multiplier
;
out
<<
" weight_decay_mult="
<<
item
.
weight_decay_multiplier
;
out
<<
" bias_learning_rate_mult="
<<
item
.
bias_learning_rate_multiplier
;
out
<<
" bias_weight_decay_mult="
<<
item
.
bias_weight_decay_multiplier
;
return
out
;
return
out
;
}
}
...
@@ -217,6 +269,10 @@ namespace dlib
...
@@ -217,6 +269,10 @@ namespace dlib
alias_tensor
filters
,
biases
;
alias_tensor
filters
,
biases
;
tt
::
tensor_conv
conv
;
tt
::
tensor_conv
conv
;
double
learning_rate_multiplier
;
double
weight_decay_multiplier
;
double
bias_learning_rate_multiplier
;
double
bias_weight_decay_multiplier
;
// These are here only because older versions of con (which you might encounter
// These are here only because older versions of con (which you might encounter
// serialized to disk) used different padding settings.
// serialized to disk) used different padding settings.
...
@@ -594,20 +650,43 @@ namespace dlib
...
@@ -594,20 +650,43 @@ namespace dlib
FC_MODE
=
1
FC_MODE
=
1
};
};
const
double
DEFAULT_BATCH_NORM_EPS
=
0.00001
;
template
<
template
<
layer_mode
mode
layer_mode
mode
>
>
class
bn_
class
bn_
{
{
public:
public:
bn_
()
:
num_updates
(
0
),
running_stats_window_size
(
1000
)
explicit
bn_
(
unsigned
long
window_size
,
double
eps_
=
DEFAULT_BATCH_NORM_EPS
)
:
num_updates
(
0
),
running_stats_window_size
(
window_size
),
learning_rate_multiplier
(
1
),
weight_decay_multiplier
(
0
),
bias_learning_rate_multiplier
(
1
),
bias_weight_decay_multiplier
(
1
),
eps
(
eps_
)
{}
{}
explicit
bn_
(
unsigned
long
window_size
)
:
num_updates
(
0
),
running_stats_window_size
(
window_size
)
bn_
()
:
bn_
(
1000
)
{}
{}
layer_mode
get_mode
()
const
{
return
mode
;
}
layer_mode
get_mode
()
const
{
return
mode
;
}
unsigned
long
get_running_stats_window_size
()
const
{
return
running_stats_window_size
;
}
unsigned
long
get_running_stats_window_size
()
const
{
return
running_stats_window_size
;
}
double
get_eps
()
const
{
return
eps
;
}
double
get_learning_rate_multiplier
()
const
{
return
learning_rate_multiplier
;
}
double
get_weight_decay_multiplier
()
const
{
return
weight_decay_multiplier
;
}
void
set_learning_rate_multiplier
(
double
val
)
{
learning_rate_multiplier
=
val
;
}
void
set_weight_decay_multiplier
(
double
val
)
{
weight_decay_multiplier
=
val
;
}
double
get_bias_learning_rate_multiplier
()
const
{
return
bias_learning_rate_multiplier
;
}
double
get_bias_weight_decay_multiplier
()
const
{
return
bias_weight_decay_multiplier
;
}
void
set_bias_learning_rate_multiplier
(
double
val
)
{
bias_learning_rate_multiplier
=
val
;
}
void
set_bias_weight_decay_multiplier
(
double
val
)
{
bias_weight_decay_multiplier
=
val
;
}
template
<
typename
SUBNET
>
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
)
void
setup
(
const
SUBNET
&
sub
)
...
@@ -648,16 +727,16 @@ namespace dlib
...
@@ -648,16 +727,16 @@ namespace dlib
if
(
num_updates
<
running_stats_window_size
)
if
(
num_updates
<
running_stats_window_size
)
++
num_updates
;
++
num_updates
;
if
(
mode
==
FC_MODE
)
if
(
mode
==
FC_MODE
)
tt
::
batch_normalize
(
output
,
means
,
invstds
,
decay
,
running_means
,
running_variances
,
sub
.
get_output
(),
g
,
b
);
tt
::
batch_normalize
(
eps
,
output
,
means
,
invstds
,
decay
,
running_means
,
running_variances
,
sub
.
get_output
(),
g
,
b
);
else
else
tt
::
batch_normalize_conv
(
output
,
means
,
invstds
,
decay
,
running_means
,
running_variances
,
sub
.
get_output
(),
g
,
b
);
tt
::
batch_normalize_conv
(
eps
,
output
,
means
,
invstds
,
decay
,
running_means
,
running_variances
,
sub
.
get_output
(),
g
,
b
);
}
}
else
// we are running in testing mode so we just linearly scale the input tensor.
else
// we are running in testing mode so we just linearly scale the input tensor.
{
{
if
(
mode
==
FC_MODE
)
if
(
mode
==
FC_MODE
)
tt
::
batch_normalize_inference
(
output
,
sub
.
get_output
(),
g
,
b
,
running_means
,
running_variances
);
tt
::
batch_normalize_inference
(
eps
,
output
,
sub
.
get_output
(),
g
,
b
,
running_means
,
running_variances
);
else
else
tt
::
batch_normalize_conv_inference
(
output
,
sub
.
get_output
(),
g
,
b
,
running_means
,
running_variances
);
tt
::
batch_normalize_conv_inference
(
eps
,
output
,
sub
.
get_output
(),
g
,
b
,
running_means
,
running_variances
);
}
}
}
}
...
@@ -668,9 +747,9 @@ namespace dlib
...
@@ -668,9 +747,9 @@ namespace dlib
auto
g_grad
=
gamma
(
params_grad
,
0
);
auto
g_grad
=
gamma
(
params_grad
,
0
);
auto
b_grad
=
beta
(
params_grad
,
gamma
.
size
());
auto
b_grad
=
beta
(
params_grad
,
gamma
.
size
());
if
(
mode
==
FC_MODE
)
if
(
mode
==
FC_MODE
)
tt
::
batch_normalize_gradient
(
gradient_input
,
means
,
invstds
,
sub
.
get_output
(),
g
,
sub
.
get_gradient_input
(),
g_grad
,
b_grad
);
tt
::
batch_normalize_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
sub
.
get_output
(),
g
,
sub
.
get_gradient_input
(),
g_grad
,
b_grad
);
else
else
tt
::
batch_normalize_conv_gradient
(
gradient_input
,
means
,
invstds
,
sub
.
get_output
(),
g
,
sub
.
get_gradient_input
(),
g_grad
,
b_grad
);
tt
::
batch_normalize_conv_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
sub
.
get_output
(),
g
,
sub
.
get_gradient_input
(),
g_grad
,
b_grad
);
}
}
const
tensor
&
get_layer_params
()
const
{
return
params
;
}
const
tensor
&
get_layer_params
()
const
{
return
params
;
}
...
@@ -679,9 +758,9 @@ namespace dlib
...
@@ -679,9 +758,9 @@ namespace dlib
friend
void
serialize
(
const
bn_
&
item
,
std
::
ostream
&
out
)
friend
void
serialize
(
const
bn_
&
item
,
std
::
ostream
&
out
)
{
{
if
(
mode
==
CONV_MODE
)
if
(
mode
==
CONV_MODE
)
serialize
(
"bn_con"
,
out
);
serialize
(
"bn_con
2
"
,
out
);
else
// if FC_MODE
else
// if FC_MODE
serialize
(
"bn_fc"
,
out
);
serialize
(
"bn_fc
2
"
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
item
.
gamma
,
out
);
serialize
(
item
.
gamma
,
out
);
serialize
(
item
.
beta
,
out
);
serialize
(
item
.
beta
,
out
);
...
@@ -691,6 +770,11 @@ namespace dlib
...
@@ -691,6 +770,11 @@ namespace dlib
serialize
(
item
.
running_variances
,
out
);
serialize
(
item
.
running_variances
,
out
);
serialize
(
item
.
num_updates
,
out
);
serialize
(
item
.
num_updates
,
out
);
serialize
(
item
.
running_stats_window_size
,
out
);
serialize
(
item
.
running_stats_window_size
,
out
);
serialize
(
item
.
learning_rate_multiplier
,
out
);
serialize
(
item
.
weight_decay_multiplier
,
out
);
serialize
(
item
.
bias_learning_rate_multiplier
,
out
);
serialize
(
item
.
bias_weight_decay_multiplier
,
out
);
serialize
(
item
.
eps
,
out
);
}
}
friend
void
deserialize
(
bn_
&
item
,
std
::
istream
&
in
)
friend
void
deserialize
(
bn_
&
item
,
std
::
istream
&
in
)
...
@@ -701,12 +785,12 @@ namespace dlib
...
@@ -701,12 +785,12 @@ namespace dlib
{
{
if
(
mode
==
CONV_MODE
)
if
(
mode
==
CONV_MODE
)
{
{
if
(
version
!=
"bn_con"
)
if
(
version
!=
"bn_con"
&&
version
!=
"bn_con2"
)
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::bn_."
);
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::bn_."
);
}
}
else
// must be in FC_MODE
else
// must be in FC_MODE
{
{
if
(
version
!=
"bn_fc"
)
if
(
version
!=
"bn_fc"
&&
version
!=
"bn_fc2"
)
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::bn_."
);
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::bn_."
);
}
}
}
}
...
@@ -731,16 +815,38 @@ namespace dlib
...
@@ -731,16 +815,38 @@ namespace dlib
// We also need to flip the running_variances around since the previous
// We also need to flip the running_variances around since the previous
// format saved the inverse standard deviations instead of variances.
// format saved the inverse standard deviations instead of variances.
item
.
running_variances
=
1.0
f
/
squared
(
mat
(
item
.
running_variances
))
-
tt
::
BATCH_NORM_EPS
;
item
.
running_variances
=
1.0
f
/
squared
(
mat
(
item
.
running_variances
))
-
DEFAULT_BATCH_NORM_EPS
;
}
else
if
(
version
==
"bn_con2"
||
version
==
"bn_fc2"
)
{
deserialize
(
item
.
learning_rate_multiplier
,
in
);
deserialize
(
item
.
weight_decay_multiplier
,
in
);
deserialize
(
item
.
bias_learning_rate_multiplier
,
in
);
deserialize
(
item
.
bias_weight_decay_multiplier
,
in
);
deserialize
(
item
.
eps
,
in
);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item
.
learning_rate_multiplier
=
1
;
item
.
weight_decay_multiplier
=
1
;
item
.
eps
=
DEFAULT_BATCH_NORM_EPS
;
}
}
}
}
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
bn_
&
item
)
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
bn_
&
item
)
{
{
if
(
mode
==
CONV_MODE
)
if
(
mode
==
CONV_MODE
)
out
<<
"bn_con"
;
out
<<
"bn_con
"
;
else
else
out
<<
"bn_fc"
;
out
<<
"bn_fc "
;
out
<<
" eps="
<<
item
.
eps
;
out
<<
" learning_rate_mult="
<<
item
.
learning_rate_multiplier
;
out
<<
" weight_decay_mult="
<<
item
.
weight_decay_multiplier
;
out
<<
" bias_learning_rate_mult="
<<
item
.
bias_learning_rate_multiplier
;
out
<<
" bias_weight_decay_mult="
<<
item
.
bias_weight_decay_multiplier
;
return
out
;
return
out
;
}
}
...
@@ -754,6 +860,11 @@ namespace dlib
...
@@ -754,6 +860,11 @@ namespace dlib
resizable_tensor
invstds
,
running_variances
;
resizable_tensor
invstds
,
running_variances
;
unsigned
long
num_updates
;
unsigned
long
num_updates
;
unsigned
long
running_stats_window_size
;
unsigned
long
running_stats_window_size
;
double
learning_rate_multiplier
;
double
weight_decay_multiplier
;
double
bias_learning_rate_multiplier
;
double
bias_weight_decay_multiplier
;
double
eps
;
};
};
template
<
typename
SUBNET
>
template
<
typename
SUBNET
>
...
@@ -784,11 +895,24 @@ namespace dlib
...
@@ -784,11 +895,24 @@ namespace dlib
static_assert
(
num_outputs_
>
0
,
"The number of outputs from a fc_ layer must be > 0"
);
static_assert
(
num_outputs_
>
0
,
"The number of outputs from a fc_ layer must be > 0"
);
public:
public:
fc_
()
:
num_outputs
(
num_outputs_
),
num_inputs
(
0
)
fc_
(
num_fc_outputs
o
)
:
num_outputs
(
o
.
num_outputs
),
num_inputs
(
0
),
{
learning_rate_multiplier
(
1
),
}
weight_decay_multiplier
(
1
),
bias_learning_rate_multiplier
(
1
),
bias_weight_decay_multiplier
(
0
)
{}
fc_
()
:
fc_
(
num_fc_outputs
(
num_outputs_
))
{}
fc_
(
num_fc_outputs
o
)
:
num_outputs
(
o
.
num_outputs
),
num_inputs
(
0
)
{}
double
get_learning_rate_multiplier
()
const
{
return
learning_rate_multiplier
;
}
double
get_weight_decay_multiplier
()
const
{
return
weight_decay_multiplier
;
}
void
set_learning_rate_multiplier
(
double
val
)
{
learning_rate_multiplier
=
val
;
}
void
set_weight_decay_multiplier
(
double
val
)
{
weight_decay_multiplier
=
val
;
}
double
get_bias_learning_rate_multiplier
()
const
{
return
bias_learning_rate_multiplier
;
}
double
get_bias_weight_decay_multiplier
()
const
{
return
bias_weight_decay_multiplier
;
}
void
set_bias_learning_rate_multiplier
(
double
val
)
{
bias_learning_rate_multiplier
=
val
;
}
void
set_bias_weight_decay_multiplier
(
double
val
)
{
bias_weight_decay_multiplier
=
val
;
}
unsigned
long
get_num_outputs
(
unsigned
long
get_num_outputs
(
)
const
{
return
num_outputs
;
}
)
const
{
return
num_outputs
;
}
...
@@ -834,6 +958,9 @@ namespace dlib
...
@@ -834,6 +958,9 @@ namespace dlib
template
<
typename
SUBNET
>
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
)
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
)
{
// no point computing the parameter gradients if they won't be used.
if
(
learning_rate_multiplier
!=
0
)
{
{
// compute the gradient of the weight parameters.
// compute the gradient of the weight parameters.
auto
pw
=
weights
(
params_grad
,
0
);
auto
pw
=
weights
(
params_grad
,
0
);
...
@@ -845,6 +972,7 @@ namespace dlib
...
@@ -845,6 +972,7 @@ namespace dlib
auto
pb
=
biases
(
params_grad
,
weights
.
size
());
auto
pb
=
biases
(
params_grad
,
weights
.
size
());
tt
::
assign_bias_gradient
(
pb
,
gradient_input
);
tt
::
assign_bias_gradient
(
pb
,
gradient_input
);
}
}
}
// compute the gradient for the data
// compute the gradient for the data
auto
w
=
weights
(
params
,
0
);
auto
w
=
weights
(
params
,
0
);
...
@@ -856,20 +984,24 @@ namespace dlib
...
@@ -856,20 +984,24 @@ namespace dlib
friend
void
serialize
(
const
fc_
&
item
,
std
::
ostream
&
out
)
friend
void
serialize
(
const
fc_
&
item
,
std
::
ostream
&
out
)
{
{
serialize
(
"fc_"
,
out
);
serialize
(
"fc_
2
"
,
out
);
serialize
(
item
.
num_outputs
,
out
);
serialize
(
item
.
num_outputs
,
out
);
serialize
(
item
.
num_inputs
,
out
);
serialize
(
item
.
num_inputs
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
item
.
params
,
out
);
serialize
(
item
.
weights
,
out
);
serialize
(
item
.
weights
,
out
);
serialize
(
item
.
biases
,
out
);
serialize
(
item
.
biases
,
out
);
serialize
((
int
)
bias_mode
,
out
);
serialize
((
int
)
bias_mode
,
out
);
serialize
(
item
.
learning_rate_multiplier
,
out
);
serialize
(
item
.
weight_decay_multiplier
,
out
);
serialize
(
item
.
bias_learning_rate_multiplier
,
out
);
serialize
(
item
.
bias_weight_decay_multiplier
,
out
);
}
}
friend
void
deserialize
(
fc_
&
item
,
std
::
istream
&
in
)
friend
void
deserialize
(
fc_
&
item
,
std
::
istream
&
in
)
{
{
std
::
string
version
;
std
::
string
version
;
deserialize
(
version
,
in
);
deserialize
(
version
,
in
);
if
(
version
!=
"fc_"
)
if
(
version
!=
"fc_"
&&
version
!=
"fc_2"
)
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::fc_."
);
throw
serialization_error
(
"Unexpected version '"
+
version
+
"' found while deserializing dlib::fc_."
);
deserialize
(
item
.
num_outputs
,
in
);
deserialize
(
item
.
num_outputs
,
in
);
...
@@ -880,6 +1012,22 @@ namespace dlib
...
@@ -880,6 +1012,22 @@ namespace dlib
int
bmode
=
0
;
int
bmode
=
0
;
deserialize
(
bmode
,
in
);
deserialize
(
bmode
,
in
);
if
(
bias_mode
!=
(
fc_bias_mode
)
bmode
)
throw
serialization_error
(
"Wrong fc_bias_mode found while deserializing dlib::fc_"
);
if
(
bias_mode
!=
(
fc_bias_mode
)
bmode
)
throw
serialization_error
(
"Wrong fc_bias_mode found while deserializing dlib::fc_"
);
if
(
version
==
"fc_2"
)
{
deserialize
(
item
.
learning_rate_multiplier
,
in
);
deserialize
(
item
.
weight_decay_multiplier
,
in
);
deserialize
(
item
.
bias_learning_rate_multiplier
,
in
);
deserialize
(
item
.
bias_weight_decay_multiplier
,
in
);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item
.
learning_rate_multiplier
=
1
;
item
.
weight_decay_multiplier
=
1
;
item
.
bias_learning_rate_multiplier
=
1
;
item
.
bias_weight_decay_multiplier
=
1
;
}
}
}
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
fc_
&
item
)
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
fc_
&
item
)
...
@@ -889,12 +1037,18 @@ namespace dlib
...
@@ -889,12 +1037,18 @@ namespace dlib
out
<<
"fc
\t
("
out
<<
"fc
\t
("
<<
"num_outputs="
<<
item
.
num_outputs
<<
"num_outputs="
<<
item
.
num_outputs
<<
")"
;
<<
")"
;
out
<<
" learning_rate_mult="
<<
item
.
learning_rate_multiplier
;
out
<<
" weight_decay_mult="
<<
item
.
weight_decay_multiplier
;
out
<<
" bias_learning_rate_mult="
<<
item
.
bias_learning_rate_multiplier
;
out
<<
" bias_weight_decay_mult="
<<
item
.
bias_weight_decay_multiplier
;
}
}
else
else
{
{
out
<<
"fc_no_bias ("
out
<<
"fc_no_bias ("
<<
"num_outputs="
<<
item
.
num_outputs
<<
"num_outputs="
<<
item
.
num_outputs
<<
")"
;
<<
")"
;
out
<<
" learning_rate_mult="
<<
item
.
learning_rate_multiplier
;
out
<<
" weight_decay_mult="
<<
item
.
weight_decay_multiplier
;
}
}
return
out
;
return
out
;
}
}
...
@@ -905,6 +1059,10 @@ namespace dlib
...
@@ -905,6 +1059,10 @@ namespace dlib
unsigned
long
num_inputs
;
unsigned
long
num_inputs
;
resizable_tensor
params
;
resizable_tensor
params
;
alias_tensor
weights
,
biases
;
alias_tensor
weights
,
biases
;
double
learning_rate_multiplier
;
double
weight_decay_multiplier
;
double
bias_learning_rate_multiplier
;
double
bias_weight_decay_multiplier
;
};
};
template
<
template
<
...
@@ -1143,7 +1301,7 @@ namespace dlib
...
@@ -1143,7 +1301,7 @@ namespace dlib
auto
sg
=
gamma
(
temp
,
0
);
auto
sg
=
gamma
(
temp
,
0
);
auto
sb
=
beta
(
temp
,
gamma
.
size
());
auto
sb
=
beta
(
temp
,
gamma
.
size
());
g
=
pointwise_multiply
(
mat
(
sg
),
1.0
f
/
sqrt
(
mat
(
item
.
running_variances
)
+
tt
::
BATCH_NORM_EPS
));
g
=
pointwise_multiply
(
mat
(
sg
),
1.0
f
/
sqrt
(
mat
(
item
.
running_variances
)
+
item
.
get_eps
()
));
b
=
mat
(
sb
)
-
pointwise_multiply
(
mat
(
g
),
mat
(
item
.
running_means
));
b
=
mat
(
sb
)
-
pointwise_multiply
(
mat
(
g
),
mat
(
item
.
running_means
));
}
}
...
@@ -1223,7 +1381,7 @@ namespace dlib
...
@@ -1223,7 +1381,7 @@ namespace dlib
{
{
std
::
string
version
;
std
::
string
version
;
deserialize
(
version
,
in
);
deserialize
(
version
,
in
);
if
(
version
==
"bn_con"
)
if
(
version
==
"bn_con"
||
version
==
"bn_con2"
)
{
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
// the stream and if so then just convert it right here.
...
@@ -1233,7 +1391,7 @@ namespace dlib
...
@@ -1233,7 +1391,7 @@ namespace dlib
item
=
temp
;
item
=
temp
;
return
;
return
;
}
}
else
if
(
version
==
"bn_fc"
)
else
if
(
version
==
"bn_fc"
||
version
==
"bn_fc2"
)
{
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
// the stream and if so then just convert it right here.
...
@@ -1289,8 +1447,13 @@ namespace dlib
...
@@ -1289,8 +1447,13 @@ namespace dlib
template
<
typename
SUBNET
>
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
)
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
)
{
{
output
.
copy_size
(
sub
.
get_output
());
auto
&&
t1
=
sub
.
get_output
();
tt
::
add
(
output
,
sub
.
get_output
(),
layer
<
tag
>
(
sub
).
get_output
());
auto
&&
t2
=
layer
<
tag
>
(
sub
).
get_output
();
output
.
set_size
(
std
::
max
(
t1
.
num_samples
(),
t2
.
num_samples
()),
std
::
max
(
t1
.
k
(),
t2
.
k
()),
std
::
max
(
t1
.
nr
(),
t2
.
nr
()),
std
::
max
(
t1
.
nc
(),
t2
.
nc
()));
tt
::
add
(
output
,
t1
,
t2
);
}
}
template
<
typename
SUBNET
>
template
<
typename
SUBNET
>
...
...
dlib/dnn/layers_abstract.h
View file @
93e786db
...
@@ -123,6 +123,16 @@ namespace dlib
...
@@ -123,6 +123,16 @@ namespace dlib
allow dlib to make some layers execute in-place and therefore run a
allow dlib to make some layers execute in-place and therefore run a
little faster and use less memory. Do not implement forward() and
little faster and use less memory. Do not implement forward() and
backward().
backward().
It should also be noted that layers may define additional layer specific
fields and the solvers can use these fields as they see fit. For example,
some layers define get_learning_rate_multiplier() and
get_weight_decay_multiplier() methods. The solvers that come with dlib
look at these methods, if they exist, and adjust the learning rate or
weight decay for that layer according to the multiplier. Therefore, you
can add these methods to your layer types if you want, or even define new
fields and new solvers that use those fields in some way.
!*/
!*/
public:
public:
...
@@ -367,6 +377,10 @@ namespace dlib
...
@@ -367,6 +377,10 @@ namespace dlib
ensures
ensures
- #get_num_outputs() == num_outputs
- #get_num_outputs() == num_outputs
- #get_bias_mode() == bias_mode
- #get_bias_mode() == bias_mode
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
!*/
unsigned
long
get_num_outputs
(
unsigned
long
get_num_outputs
(
...
@@ -389,6 +403,82 @@ namespace dlib
...
@@ -389,6 +403,82 @@ namespace dlib
is added to each of the outputs of this layer.
is added to each of the outputs of this layer.
!*/
!*/
double
get_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double
get_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void
set_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void
set_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double
get_bias_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double
get_bias_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void
set_bias_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void
set_bias_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
...
@@ -458,6 +548,10 @@ namespace dlib
...
@@ -458,6 +548,10 @@ namespace dlib
- #stride_x() == _stride_x
- #stride_x() == _stride_x
- #padding_y() == _padding_y
- #padding_y() == _padding_y
- #padding_x() == _padding_x
- #padding_x() == _padding_x
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
!*/
long
num_filters
(
long
num_filters
(
...
@@ -517,6 +611,82 @@ namespace dlib
...
@@ -517,6 +611,82 @@ namespace dlib
sides of the image.
sides of the image.
!*/
!*/
double
get_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double
get_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void
set_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void
set_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double
get_bias_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double
get_bias_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void
set_bias_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void
set_bias_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
...
@@ -648,6 +818,8 @@ namespace dlib
...
@@ -648,6 +818,8 @@ namespace dlib
FC_MODE
=
1
// fully connected mode
FC_MODE
=
1
// fully connected mode
};
};
const
double
DEFAULT_BATCH_NORM_EPS
=
0.00001
;
template
<
template
<
layer_mode
mode
layer_mode
mode
>
>
...
@@ -684,16 +856,29 @@ namespace dlib
...
@@ -684,16 +856,29 @@ namespace dlib
/*!
/*!
ensures
ensures
- #get_mode() == mode
- #get_mode() == mode
- get_running_stats_window_size() == 1000
- #get_running_stats_window_size() == 1000
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == tt::DEFAULT_BATCH_NORM_EPS
!*/
!*/
explicit
bn_
(
explicit
bn_
(
unsigned
long
window_size
unsigned
long
window_size
,
double
eps
=
tt
::
DEFAULT_BATCH_NORM_EPS
);
);
/*!
/*!
requires
- eps > 0
ensures
ensures
- #get_mode() == mode
- #get_mode() == mode
- get_running_stats_window_size() == window_size
- #get_running_stats_window_size() == window_size
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == eps
!*/
!*/
layer_mode
get_mode
(
layer_mode
get_mode
(
...
@@ -712,6 +897,15 @@ namespace dlib
...
@@ -712,6 +897,15 @@ namespace dlib
normalization after a convolutional layer you should use CONV_MODE.
normalization after a convolutional layer you should use CONV_MODE.
!*/
!*/
double
get_eps
(
)
const
;
/*!
ensures
- When doing batch normalization, we divide by the standard deviation.
The epsilon value returned by this function is added to the variance
so that this division can never be a division by zero.
!*/
unsigned
long
get_running_stats_window_size
(
unsigned
long
get_running_stats_window_size
(
)
const
;
)
const
;
/*!
/*!
...
@@ -725,6 +919,82 @@ namespace dlib
...
@@ -725,6 +919,82 @@ namespace dlib
the running average.
the running average.
!*/
!*/
double
get_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double
get_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void
set_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void
set_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double
get_bias_learning_rate_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double
get_bias_weight_decay_multiplier
(
)
const
;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void
set_bias_learning_rate_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void
set_bias_weight_decay_multiplier
(
double
val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
setup
(
const
SUBNET
&
sub
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
forward
(
const
SUBNET
&
sub
,
resizable_tensor
&
output
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
template
<
typename
SUBNET
>
void
backward
(
const
tensor
&
gradient_input
,
SUBNET
&
sub
,
tensor
&
params_grad
);
...
@@ -1330,7 +1600,13 @@ namespace dlib
...
@@ -1330,7 +1600,13 @@ namespace dlib
what layer to add to the output of the previous layer. The result of this
what layer to add to the output of the previous layer. The result of this
addition is output by add_prev_. Finally, the addition happens pointwise
addition is output by add_prev_. Finally, the addition happens pointwise
according to 4D tensor arithmetic. If the dimensions don't match then
according to 4D tensor arithmetic. If the dimensions don't match then
missing elements are presumed to be equal to 0.
missing elements are presumed to be equal to 0. Moreover, each dimension
of the output tensor is equal to the maximum dimension of either of the
inputs. That is, if the tensors A and B are being added to produce C then:
- C.num_samples() == max(A.num_samples(), B.num_samples())
- C.k() == max(A.k(), B.k())
- C.nr() == max(A.nr(), B.nr())
- C.nc() == max(A.nc(), B.nc())
!*/
!*/
public:
public:
...
...
dlib/dnn/solvers.h
View file @
93e786db
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include "solvers_abstract.h"
#include "solvers_abstract.h"
#include "tensor.h"
#include "tensor.h"
#include <iostream>
#include <iostream>
#include "layers.h"
namespace
dlib
namespace
dlib
{
{
...
@@ -49,10 +50,53 @@ namespace dlib
...
@@ -49,10 +50,53 @@ namespace dlib
v
=
0
;
v
=
0
;
}
}
//perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
const
double
lr
=
learning_rate
*
get_learning_rate_multiplier
(
l
);
tt
::
affine_transform
(
v
,
v
,
params
,
params_grad
,
const
double
wd
=
weight_decay
*
get_weight_decay_multiplier
(
l
);
momentum
,
-
weight_decay
*
learning_rate
,
-
learning_rate
,
0
);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
tt
::
affine_transform
(
v
,
v
,
params
,
params_grad
,
momentum
,
-
wd
*
lr
,
-
lr
);
return
v
;
}
template
<
unsigned
long
N
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
fc_
<
N
,
FC_HAS_BIAS
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
l
.
get_num_outputs
());
return
v
;
}
template
<
long
_num_filters
,
long
_nr
,
long
_nc
,
int
_stride_y
,
int
_stride_x
,
int
_padding_y
,
int
_padding_x
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
con_
<
_num_filters
,
_nr
,
_nc
,
_stride_y
,
_stride_x
,
_padding_y
,
_padding_x
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
l
.
num_filters
());
return
v
;
}
template
<
layer_mode
mode
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
bn_
<
mode
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
params_grad
.
size
()
/
2
);
return
v
;
return
v
;
}
}
...
@@ -76,9 +120,49 @@ namespace dlib
...
@@ -76,9 +120,49 @@ namespace dlib
}
}
private:
private:
template
<
typename
layer_type
>
void
update_considering_bias
(
const
float
learning_rate
,
const
layer_type
&
l
,
const
tensor
&
params_grad
,
unsigned
long
bias_offset
)
{
const
tensor
&
params
=
l
.
get_layer_params
();
DLIB_CASSERT
(
params
.
size
()
!=
0
,
""
);
if
(
v
.
size
()
==
0
)
{
v
.
copy_size
(
params_grad
);
v
=
0
;
}
double
lr
=
learning_rate
*
get_learning_rate_multiplier
(
l
);
double
wd
=
weight_decay
*
get_weight_decay_multiplier
(
l
);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
if
(
l
.
get_bias_learning_rate_multiplier
()
==
1
&&
l
.
get_bias_weight_decay_multiplier
()
==
1
)
{
tt
::
affine_transform
(
v
,
v
,
params
,
params_grad
,
momentum
,
-
wd
*
lr
,
-
lr
);
}
else
{
tt
::
affine_transform_range
(
0
,
bias_offset
,
v
,
v
,
params
,
params_grad
,
momentum
,
-
wd
*
lr
,
-
lr
);
// now update the biases but apply their multipliers
lr
*=
l
.
get_bias_learning_rate_multiplier
();
wd
*=
l
.
get_bias_weight_decay_multiplier
();
tt
::
affine_transform_range
(
bias_offset
,
v
.
size
(),
v
,
v
,
params
,
params_grad
,
momentum
,
-
wd
*
lr
,
-
lr
);
}
}
resizable_tensor
v
;
resizable_tensor
v
;
float
weight_decay
;
float
weight_decay
;
float
momentum
;
float
momentum
;
};
};
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
...
@@ -132,11 +216,57 @@ namespace dlib
...
@@ -132,11 +216,57 @@ namespace dlib
++
t
;
++
t
;
tt
::
compute_adam_update
(
s
,
m
,
v
,
t
,
learning_rate
,
weight_decay
,
momentum1
,
momentum2
,
params
,
params_grad
);
tt
::
compute_adam_update
(
0
,
params
.
size
(),
s
,
m
,
v
,
t
,
learning_rate
*
get_learning_rate_multiplier
(
l
),
weight_decay
*
get_weight_decay_multiplier
(
l
),
momentum1
,
momentum2
,
params
,
params_grad
);
return
s
;
return
s
;
}
}
template
<
unsigned
long
N
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
fc_
<
N
,
FC_HAS_BIAS
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
l
.
get_num_outputs
());
return
s
;
}
template
<
long
_num_filters
,
long
_nr
,
long
_nc
,
int
_stride_y
,
int
_stride_x
,
int
_padding_y
,
int
_padding_x
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
con_
<
_num_filters
,
_nr
,
_nc
,
_stride_y
,
_stride_x
,
_padding_y
,
_padding_x
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
l
.
num_filters
());
return
s
;
}
template
<
layer_mode
mode
>
const
tensor
&
operator
()
(
const
float
learning_rate
,
const
bn_
<
mode
>&
l
,
const
tensor
&
params_grad
)
{
update_considering_bias
(
learning_rate
,
l
,
params_grad
,
params_grad
.
size
()
/
2
);
return
s
;
}
friend
void
serialize
(
const
adam
&
item
,
std
::
ostream
&
out
)
friend
void
serialize
(
const
adam
&
item
,
std
::
ostream
&
out
)
{
{
serialize
(
"adam2"
,
out
);
serialize
(
"adam2"
,
out
);
...
@@ -165,6 +295,49 @@ namespace dlib
...
@@ -165,6 +295,49 @@ namespace dlib
}
}
private:
private:
template
<
typename
layer_type
>
void
update_considering_bias
(
const
float
learning_rate
,
const
layer_type
&
l
,
const
tensor
&
params_grad
,
unsigned
long
bias_offset
)
{
const
tensor
&
params
=
l
.
get_layer_params
();
DLIB_CASSERT
(
params
.
size
()
!=
0
,
""
);
if
(
v
.
size
()
==
0
)
{
m
.
copy_size
(
params_grad
);
m
=
0
;
v
.
copy_size
(
params_grad
);
v
=
0
;
s
.
copy_size
(
params_grad
);
}
++
t
;
if
(
l
.
get_bias_learning_rate_multiplier
()
==
1
&&
l
.
get_bias_weight_decay_multiplier
()
==
1
)
{
tt
::
compute_adam_update
(
0
,
params
.
size
(),
s
,
m
,
v
,
t
,
learning_rate
*
get_learning_rate_multiplier
(
l
),
weight_decay
*
get_weight_decay_multiplier
(
l
),
momentum1
,
momentum2
,
params
,
params_grad
);
}
else
{
tt
::
compute_adam_update
(
0
,
bias_offset
,
s
,
m
,
v
,
t
,
learning_rate
*
get_learning_rate_multiplier
(
l
),
weight_decay
*
get_weight_decay_multiplier
(
l
),
momentum1
,
momentum2
,
params
,
params_grad
);
tt
::
compute_adam_update
(
bias_offset
,
params
.
size
(),
s
,
m
,
v
,
t
,
learning_rate
*
get_learning_rate_multiplier
(
l
)
*
l
.
get_bias_learning_rate_multiplier
(),
weight_decay
*
get_weight_decay_multiplier
(
l
)
*
l
.
get_bias_weight_decay_multiplier
(),
momentum1
,
momentum2
,
params
,
params_grad
);
}
}
resizable_tensor
m
;
resizable_tensor
m
;
resizable_tensor
v
;
resizable_tensor
v
;
resizable_tensor
s
;
resizable_tensor
s
;
...
...
dlib/dnn/solvers_abstract.h
View file @
93e786db
...
@@ -78,6 +78,15 @@ namespace dlib
...
@@ -78,6 +78,15 @@ namespace dlib
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
invocation of operator() to the next.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate and weight decay
multipliers to be applied to fc_, con_, and bn_ bias parameters.
!*/
!*/
public:
public:
...
@@ -123,6 +132,15 @@ namespace dlib
...
@@ -123,6 +132,15 @@ namespace dlib
paper:
paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
optimization." International Conference on Learning Representations. 2015.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate and weight decay
multipliers to be applied to fc_, con_, and bn_ bias parameters.
!*/
!*/
public:
public:
...
...
dlib/dnn/tensor_tools.cpp
View file @
93e786db
...
@@ -240,6 +240,42 @@ namespace dlib { namespace tt
...
@@ -240,6 +240,42 @@ namespace dlib { namespace tt
#endif
#endif
}
}
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
)
{
#ifdef DLIB_USE_CUDA
cuda
::
affine_transform_range
(
begin
,
end
,
dest
,
src1
,
src2
,
src3
,
A
,
B
,
C
);
#else
cpu
::
affine_transform_range
(
begin
,
end
,
dest
,
src1
,
src2
,
src3
,
A
,
B
,
C
);
#endif
}
void
affine_transform
(
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
)
{
#ifdef DLIB_USE_CUDA
cuda
::
affine_transform_range
(
0
,
dest
.
size
(),
dest
,
src1
,
src2
,
src3
,
A
,
B
,
C
);
#else
cpu
::
affine_transform_range
(
0
,
dest
.
size
(),
dest
,
src1
,
src2
,
src3
,
A
,
B
,
C
);
#endif
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
affine_transform
(
void
affine_transform
(
...
@@ -275,6 +311,8 @@ namespace dlib { namespace tt
...
@@ -275,6 +311,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
@@ -288,10 +326,10 @@ namespace dlib { namespace tt
...
@@ -288,10 +326,10 @@ namespace dlib { namespace tt
)
)
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
compute_adam_update
(
s
,
m
,
v
,
t
,
learning_rate
,
weight_decay
,
momentum1
,
cuda
::
compute_adam_update
(
begin
,
end
,
s
,
m
,
v
,
t
,
learning_rate
,
weight_decay
,
momentum1
,
momentum2
,
params
,
params_grad
);
momentum2
,
params
,
params_grad
);
#else
#else
cpu
::
compute_adam_update
(
s
,
m
,
v
,
t
,
learning_rate
,
weight_decay
,
momentum1
,
cpu
::
compute_adam_update
(
begin
,
end
,
s
,
m
,
v
,
t
,
learning_rate
,
weight_decay
,
momentum1
,
momentum2
,
params
,
params_grad
);
momentum2
,
params
,
params_grad
);
#endif
#endif
}
}
...
@@ -299,6 +337,7 @@ namespace dlib { namespace tt
...
@@ -299,6 +337,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -308,13 +347,14 @@ namespace dlib { namespace tt
...
@@ -308,13 +347,14 @@ namespace dlib { namespace tt
)
)
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize_inference
(
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cuda
::
batch_normalize_inference
(
eps
,
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
#else
#else
cpu
::
batch_normalize_inference
(
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cpu
::
batch_normalize_inference
(
eps
,
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
#endif
#endif
}
}
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
vars
,
resizable_tensor
&
vars
,
...
@@ -327,13 +367,14 @@ namespace dlib { namespace tt
...
@@ -327,13 +367,14 @@ namespace dlib { namespace tt
)
)
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize
(
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize
(
eps
,
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
#else
#else
cpu
::
batch_normalize
(
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cpu
::
batch_normalize
(
eps
,
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
#endif
#endif
}
}
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -346,15 +387,16 @@ namespace dlib { namespace tt
...
@@ -346,15 +387,16 @@ namespace dlib { namespace tt
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cuda
::
batch_normalize_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
#else
#else
cpu
::
batch_normalize_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cpu
::
batch_normalize_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
#endif
#endif
}
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -364,13 +406,14 @@ namespace dlib { namespace tt
...
@@ -364,13 +406,14 @@ namespace dlib { namespace tt
)
)
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize_conv_inference
(
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cuda
::
batch_normalize_conv_inference
(
eps
,
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
#else
#else
cpu
::
batch_normalize_conv_inference
(
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cpu
::
batch_normalize_conv_inference
(
eps
,
dest
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
#endif
#endif
}
}
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
vars
,
resizable_tensor
&
vars
,
...
@@ -383,13 +426,14 @@ namespace dlib { namespace tt
...
@@ -383,13 +426,14 @@ namespace dlib { namespace tt
)
)
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize_conv
(
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize_conv
(
eps
,
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
#else
#else
cpu
::
batch_normalize_conv
(
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cpu
::
batch_normalize_conv
(
eps
,
dest
,
means
,
vars
,
averaging_factor
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
#endif
#endif
}
}
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -402,9 +446,9 @@ namespace dlib { namespace tt
...
@@ -402,9 +446,9 @@ namespace dlib { namespace tt
{
{
#ifdef DLIB_USE_CUDA
#ifdef DLIB_USE_CUDA
cuda
::
batch_normalize_conv_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cuda
::
batch_normalize_conv_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
#else
#else
cpu
::
batch_normalize_conv_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cpu
::
batch_normalize_conv_gradient
(
eps
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
#endif
#endif
}
}
...
...
dlib/dnn/tensor_tools.h
View file @
93e786db
...
@@ -229,13 +229,58 @@ namespace dlib { namespace tt
...
@@ -229,13 +229,58 @@ namespace dlib { namespace tt
const
float
D
const
float
D
);
);
/*!
/*!
requires - dest.size()==src1.size()
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- dest.size()==src3.size()
ensures
ensures
- #dest == A*src1 + B*src2 + C*src3 + D
- #dest == A*src1 + B*src2 + C*src3 + D
!*/
!*/
// Three-source affine combination with no constant offset term.
void
affine_transform
(
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3
(i.e. the same result as the overload taking a constant D, with D == 0)
!*/
// Range-restricted form of the three-source affine_transform(): only the
// elements whose index lies in [begin,end) are computed.
void
affine_transform_range
(
size_t
begin
,
size_t
end
,
tensor
&
dest
,
const
tensor
&
src1
,
const
tensor
&
src2
,
const
tensor
&
src3
,
const
float
A
,
const
float
B
,
const
float
C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- begin <= end <= dest.size()
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the half open range [begin,end) rather than processing the entire tensor.
Specifically, it does this:
- for i in the range [begin, end):
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
- If begin == end then the range is empty and no element is processed.
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
affine_transform
(
void
affine_transform
(
...
@@ -290,6 +335,8 @@ namespace dlib { namespace tt
...
@@ -290,6 +335,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
compute_adam_update
(
void
compute_adam_update
(
size_t
begin
,
size_t
end
,
tensor
&
s
,
tensor
&
s
,
tensor
&
m
,
tensor
&
m
,
tensor
&
v
,
tensor
&
v
,
...
@@ -309,19 +356,22 @@ namespace dlib { namespace tt
...
@@ -309,19 +356,22 @@ namespace dlib { namespace tt
- weight_decay >= 0
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
- 0 <= momentum2 < 1
- begin <= end <= params.size()
ensures
ensures
- This function implements the ADAM parameter update method described in the paper:
- This function implements the ADAM parameter update method described in the paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
optimization." International Conference on Learning Representations. 2015.
Specifically, it implements the method shown as Algorithm 1.
Specifically, it implements the method shown as Algorithm 1.
- #s is the update vector that should be added to the parameters.
- #s is the update vector that should be added to the parameters.
- The function only operates in the half open range [begin,end) of the memory
blocks of each tensor. E.g. to make this function run on the entire tensor
set begin to 0 and end to params.size().
!*/
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
const
double
BATCH_NORM_EPS
=
0.00001
;
void
batch_normalize_inference
(
void
batch_normalize_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -331,6 +381,7 @@ namespace dlib { namespace tt
...
@@ -331,6 +381,7 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.num_samples() == 1
- gamma.nr() == src.nr()
- gamma.nr() == src.nr()
- gamma.nc() == src.nc()
- gamma.nc() == src.nc()
...
@@ -342,11 +393,12 @@ namespace dlib { namespace tt
...
@@ -342,11 +393,12 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize() would if src had means
- Linearly transforms src as a call to batch_normalize() would if src had means
and variances as given by running_means and running_variances. That is, this
and variances as given by running_means and running_variances. That is, this
function performs:
function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+
BATCH_NORM_EPS
) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+
eps
) + beta
Note that it does it in a pointwise fashion over the samples in src.
Note that it does it in a pointwise fashion over the samples in src.
!*/
!*/
void
batch_normalize
(
void
batch_normalize
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -359,6 +411,7 @@ namespace dlib { namespace tt
...
@@ -359,6 +411,7 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- src.num_samples() > 1
- src.num_samples() > 1
- gamma.num_samples() == 1
- gamma.num_samples() == 1
- beta.num_samples() == 1
- beta.num_samples() == 1
...
@@ -384,6 +437,7 @@ namespace dlib { namespace tt
...
@@ -384,6 +437,7 @@ namespace dlib { namespace tt
!*/
!*/
void
batch_normalize_gradient
(
void
batch_normalize_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -395,8 +449,9 @@ namespace dlib { namespace tt
...
@@ -395,8 +449,9 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- invstds and means should be the output of a call to
- invstds and means should be the output of a call to
batch_normalize(dest,means,invstds,src,gamma,beta)
batch_normalize(
eps,
dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
- src.num_samples() > 1
...
@@ -410,7 +465,7 @@ namespace dlib { namespace tt
...
@@ -410,7 +465,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
- have_same_dimensions(invstds, gamma) == true
ensures
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize(dest,means,invstds,src,gamma,beta))
batch_normalize(
eps,
dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
...
@@ -419,6 +474,7 @@ namespace dlib { namespace tt
...
@@ -419,6 +474,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void
batch_normalize_conv_inference
(
void
batch_normalize_conv_inference
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
const
tensor
&
src
,
const
tensor
&
src
,
const
tensor
&
gamma
,
const
tensor
&
gamma
,
...
@@ -428,6 +484,7 @@ namespace dlib { namespace tt
...
@@ -428,6 +484,7 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.num_samples() == 1
- gamma.nr() == 1
- gamma.nr() == 1
- gamma.nc() == 1
- gamma.nc() == 1
...
@@ -439,12 +496,13 @@ namespace dlib { namespace tt
...
@@ -439,12 +496,13 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize_conv() would if src had
- Linearly transforms src as a call to batch_normalize_conv() would if src had
means and variances as given by running_means and running_variances. That
means and variances as given by running_means and running_variances. That
is, this function performs:
is, this function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+
BATCH_NORM_EPS
) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+
eps
) + beta
Note that it does this in a pointwise fashion over the samples, rows, and
Note that it does this in a pointwise fashion over the samples, rows, and
columns in src.
columns in src.
!*/
!*/
void
batch_normalize_conv
(
void
batch_normalize_conv
(
const
double
eps
,
resizable_tensor
&
dest
,
resizable_tensor
&
dest
,
resizable_tensor
&
means
,
resizable_tensor
&
means
,
resizable_tensor
&
invstds
,
resizable_tensor
&
invstds
,
...
@@ -457,6 +515,7 @@ namespace dlib { namespace tt
...
@@ -457,6 +515,7 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- src.num_samples() > 1
- src.num_samples() > 1
- gamma.num_samples()==gamma.nr()==gamma.nc() == 1
- gamma.num_samples()==gamma.nr()==gamma.nc() == 1
- beta.num_samples() ==beta.nr() ==beta.nc() == 1
- beta.num_samples() ==beta.nr() ==beta.nc() == 1
...
@@ -478,6 +537,7 @@ namespace dlib { namespace tt
...
@@ -478,6 +537,7 @@ namespace dlib { namespace tt
!*/
!*/
void
batch_normalize_conv_gradient
(
void
batch_normalize_conv_gradient
(
const
double
eps
,
const
tensor
&
gradient_input
,
const
tensor
&
gradient_input
,
const
tensor
&
means
,
const
tensor
&
means
,
const
tensor
&
invstds
,
const
tensor
&
invstds
,
...
@@ -489,8 +549,9 @@ namespace dlib { namespace tt
...
@@ -489,8 +549,9 @@ namespace dlib { namespace tt
);
);
/*!
/*!
requires
requires
- eps > 0
- invstds and means should be the output of a call to
- invstds and means should be the output of a call to
batch_normalize_conv(dest,means,invstds,src,gamma,beta)
batch_normalize_conv(
eps,
dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
- src.num_samples() > 1
...
@@ -502,7 +563,7 @@ namespace dlib { namespace tt
...
@@ -502,7 +563,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
- have_same_dimensions(invstds, gamma) == true
ensures
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize_conv(dest,means,invstds,src,gamma,beta))
batch_normalize_conv(
eps,
dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
...
...
dlib/dnn/trainer.h
View file @
93e786db
...
@@ -526,8 +526,7 @@ namespace dlib
...
@@ -526,8 +526,7 @@ namespace dlib
label_type
pick_which_run_update
;
label_type
pick_which_run_update
;
job_t
next_job
;
job_t
next_job
;
std
::
vector
<
std
::
future
<
double
>>
losses
(
devices
.
size
());
std
::
vector
<
dlib
::
future
<
double
>>
losses
(
devices
.
size
());
std
::
vector
<
std
::
future
<
void
>>
update_futs
(
devices
.
size
());
std
::
vector
<
tt
::
multi_device_tensor_averager
>
averagers
;
std
::
vector
<
tt
::
multi_device_tensor_averager
>
averagers
;
// An array of all the parameter tensors in the first network. We will
// An array of all the parameter tensors in the first network. We will
...
@@ -536,6 +535,16 @@ namespace dlib
...
@@ -536,6 +535,16 @@ namespace dlib
std
::
vector
<
tensor
*>
reference_params
;
std
::
vector
<
tensor
*>
reference_params
;
visit_layer_parameters
(
devices
[
0
]
->
net
,
[
&
](
size_t
,
tensor
&
t
)
{
reference_params
.
push_back
(
&
t
);
});
visit_layer_parameters
(
devices
[
0
]
->
net
,
[
&
](
size_t
,
tensor
&
t
)
{
reference_params
.
push_back
(
&
t
);
});
// We make separate thread pools with just one thread in them because we want
// to make sure each device is always executed on the same thread. We care
// about this because there are thread_local context variables for some cuda
// components and they get regenerated when the current cuda device changes.
// Recreating them over and over is somewhat expensive so we want to avoid
// that.
std
::
vector
<
std
::
shared_ptr
<
thread_pool
>>
tp
;
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
tp
.
push_back
(
std
::
make_shared
<
thread_pool
>
(
1
));
size_t
iteration
=
0
;
size_t
iteration
=
0
;
while
(
job_pipe
.
dequeue
(
next_job
))
while
(
job_pipe
.
dequeue
(
next_job
))
...
@@ -545,7 +554,7 @@ namespace dlib
...
@@ -545,7 +554,7 @@ namespace dlib
// right version for unsupervised or supervised training based on the type
// right version for unsupervised or supervised training based on the type
// of label_type.
// of label_type.
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
losses
[
i
]
=
std
::
async
(
std
::
launch
::
async
,[
&
,
i
](){
return
compute_parameter_gradients
(
i
,
next_job
,
pick_which_run_update
);
});
tp
[
i
]
->
add_task_by_value
([
&
,
i
](
double
&
loss
){
loss
=
compute_parameter_gradients
(
i
,
next_job
,
pick_which_run_update
);
}
,
losses
[
i
]
);
// aggregate loss values from all the network computations.
// aggregate loss values from all the network computations.
double
theloss
=
0
;
double
theloss
=
0
;
for
(
auto
&&
loss
:
losses
)
for
(
auto
&&
loss
:
losses
)
...
@@ -596,10 +605,10 @@ namespace dlib
...
@@ -596,10 +605,10 @@ namespace dlib
// Now apply all the updates to each device.
// Now apply all the updates to each device.
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
update_futs
[
i
]
=
std
::
async
(
std
::
launch
::
async
,
[
&
,
i
](){
if
(
next_job
.
have_data
[
i
])
update_parameters
(
i
);
});
tp
[
i
]
->
add_task_by_value
(
[
&
,
i
](){
if
(
next_job
.
have_data
[
i
])
update_parameters
(
i
);
});
// and wait for the updates to all happen.
// and wait for the updates to all happen.
for
(
auto
&&
f
:
update_futs
)
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
f
.
wait
();
tp
[
i
]
->
wait_for_all_tasks
();
// Every now and then force all the parameters to be the same just to make
// Every now and then force all the parameters to be the same just to make
...
...
dlib/optimization/optimization.h
View file @
93e786db
...
@@ -482,7 +482,7 @@ namespace dlib
...
@@ -482,7 +482,7 @@ namespace dlib
<<
"
\n\t
x_upper.size(): "
<<
x_upper
.
size
()
<<
"
\n\t
x_upper.size(): "
<<
x_upper
.
size
()
);
);
DLIB_ASSERT
(
DLIB_ASSERT
(
min
(
x_upper
-
x_lower
)
>
0
,
min
(
x_upper
-
x_lower
)
>
=
0
,
"
\t
double find_min_box_constrained()"
"
\t
double find_min_box_constrained()"
<<
"
\n\t
You have to supply proper box constraints to this function."
<<
"
\n\t
You have to supply proper box constraints to this function."
<<
"
\n\r
min(x_upper-x_lower): "
<<
min
(
x_upper
-
x_lower
)
<<
"
\n\r
min(x_upper-x_lower): "
<<
min
(
x_upper
-
x_lower
)
...
@@ -610,7 +610,7 @@ namespace dlib
...
@@ -610,7 +610,7 @@ namespace dlib
<<
"
\n\t
x_upper.size(): "
<<
x_upper
.
size
()
<<
"
\n\t
x_upper.size(): "
<<
x_upper
.
size
()
);
);
DLIB_ASSERT
(
DLIB_ASSERT
(
min
(
x_upper
-
x_lower
)
>
0
,
min
(
x_upper
-
x_lower
)
>
=
0
,
"
\t
double find_max_box_constrained()"
"
\t
double find_max_box_constrained()"
<<
"
\n\t
You have to supply proper box constraints to this function."
<<
"
\n\t
You have to supply proper box constraints to this function."
<<
"
\n\r
min(x_upper-x_lower): "
<<
min
(
x_upper
-
x_lower
)
<<
"
\n\r
min(x_upper-x_lower): "
<<
min
(
x_upper
-
x_lower
)
...
...
dlib/optimization/optimization_abstract.h
View file @
93e786db
...
@@ -297,7 +297,7 @@ namespace dlib
...
@@ -297,7 +297,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) > 0
- min(x_upper-x_lower) >
=
0
(i.e. x_upper must contain upper bounds relative to x_lower)
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
ensures
- Performs a box constrained minimization of the function f() using the given
- Performs a box constrained minimization of the function f() using the given
...
@@ -391,7 +391,7 @@ namespace dlib
...
@@ -391,7 +391,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) > 0
- min(x_upper-x_lower) >
=
0
(i.e. x_upper must contain upper bounds relative to x_lower)
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
ensures
- Performs a box constrained maximization of the function f() using the given
- Performs a box constrained maximization of the function f() using the given
...
...
dlib/test/dnn.cpp
View file @
93e786db
...
@@ -165,13 +165,13 @@ namespace
...
@@ -165,13 +165,13 @@ namespace
resizable_tensor
running_means
;
resizable_tensor
running_means
;
resizable_tensor
running_variances
;
resizable_tensor
running_variances
;
batch_normalize
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
const
double
scale
=
(
src
.
num_samples
())
/
(
src
.
num_samples
()
-
1.0
);
const
double
scale
=
(
src
.
num_samples
())
/
(
src
.
num_samples
()
-
1.0
);
// Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
// Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
running_variances
=
mat
(
running_variances
)
/
scale
;
running_variances
=
mat
(
running_variances
)
/
scale
;
batch_normalize_inference
(
dest2
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
batch_normalize_inference
(
DEFAULT_BATCH_NORM_EPS
,
dest2
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
DLIB_TEST_MSG
(
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
)))
<
1e-5
,
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
))));
DLIB_TEST_MSG
(
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
)))
<
1e-5
,
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
))));
cpu
::
batch_normalize_inference
(
dest3
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cpu
::
batch_normalize_inference
(
DEFAULT_BATCH_NORM_EPS
,
dest3
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
DLIB_TEST_MSG
(
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
)))
<
1e-5
,
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
))));
DLIB_TEST_MSG
(
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
)))
<
1e-5
,
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
))));
...
@@ -179,7 +179,7 @@ namespace
...
@@ -179,7 +179,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
src
.
host
()[
idx
];
const
float
old
=
src
.
host
()[
idx
];
src
.
host
()[
idx
]
+=
eps
;
src
.
host
()[
idx
]
+=
eps
;
batch_normalize
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
src
.
host
()[
idx
]
=
old
;
src
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -191,7 +191,7 @@ namespace
...
@@ -191,7 +191,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
gamma
.
host
()[
idx
];
const
float
old
=
gamma
.
host
()[
idx
];
gamma
.
host
()[
idx
]
+=
eps
;
gamma
.
host
()[
idx
]
+=
eps
;
batch_normalize
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
gamma
.
host
()[
idx
]
=
old
;
gamma
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -203,7 +203,7 @@ namespace
...
@@ -203,7 +203,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
beta
.
host
()[
idx
];
const
float
old
=
beta
.
host
()[
idx
];
beta
.
host
()[
idx
]
+=
eps
;
beta
.
host
()[
idx
]
+=
eps
;
batch_normalize
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
beta
.
host
()[
idx
]
=
old
;
beta
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -220,7 +220,7 @@ namespace
...
@@ -220,7 +220,7 @@ namespace
gamma_grad
=
8
;
gamma_grad
=
8
;
beta_grad
=
8
;
beta_grad
=
8
;
batch_normalize_gradient
(
gradient_input
,
means
,
vars
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
batch_normalize_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
vars
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
auto
grad_error
=
compare_gradients
(
src_grad
,
grad_src
);
auto
grad_error
=
compare_gradients
(
src_grad
,
grad_src
);
dlog
<<
LINFO
<<
"src error: "
<<
grad_error
;
dlog
<<
LINFO
<<
"src error: "
<<
grad_error
;
...
@@ -250,14 +250,14 @@ namespace
...
@@ -250,14 +250,14 @@ namespace
resizable_tensor
running_means
;
resizable_tensor
running_means
;
resizable_tensor
running_variances
;
resizable_tensor
running_variances
;
batch_normalize_conv
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
const
double
scale
=
(
src
.
num_samples
()
*
src
.
nr
()
*
src
.
nc
())
/
(
src
.
num_samples
()
*
src
.
nr
()
*
src
.
nc
()
-
1.0
);
const
double
scale
=
(
src
.
num_samples
()
*
src
.
nr
()
*
src
.
nc
())
/
(
src
.
num_samples
()
*
src
.
nr
()
*
src
.
nc
()
-
1.0
);
// Turn back into biased variance estimate because that's how
// Turn back into biased variance estimate because that's how
// batch_normalize_conv() works, so if we want to match it this is necessary.
// batch_normalize_conv() works, so if we want to match it this is necessary.
running_variances
=
mat
(
running_variances
)
/
scale
;
running_variances
=
mat
(
running_variances
)
/
scale
;
batch_normalize_conv_inference
(
dest2
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
batch_normalize_conv_inference
(
DEFAULT_BATCH_NORM_EPS
,
dest2
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
DLIB_TEST
(
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
)))
<
1e-5
);
DLIB_TEST
(
max
(
abs
(
mat
(
dest2
)
-
mat
(
dest
)))
<
1e-5
);
cpu
::
batch_normalize_conv_inference
(
dest3
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
cpu
::
batch_normalize_conv_inference
(
DEFAULT_BATCH_NORM_EPS
,
dest3
,
src
,
gamma
,
beta
,
running_means
,
running_variances
);
DLIB_TEST
(
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
)))
<
1e-5
);
DLIB_TEST
(
max
(
abs
(
mat
(
dest3
)
-
mat
(
dest
)))
<
1e-5
);
...
@@ -265,7 +265,7 @@ namespace
...
@@ -265,7 +265,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
src
.
host
()[
idx
];
const
float
old
=
src
.
host
()[
idx
];
src
.
host
()[
idx
]
+=
eps
;
src
.
host
()[
idx
]
+=
eps
;
batch_normalize_conv
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
src
.
host
()[
idx
]
=
old
;
src
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -277,7 +277,7 @@ namespace
...
@@ -277,7 +277,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
gamma
.
host
()[
idx
];
const
float
old
=
gamma
.
host
()[
idx
];
gamma
.
host
()[
idx
]
+=
eps
;
gamma
.
host
()[
idx
]
+=
eps
;
batch_normalize_conv
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
gamma
.
host
()[
idx
]
=
old
;
gamma
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -289,7 +289,7 @@ namespace
...
@@ -289,7 +289,7 @@ namespace
auto
f
=
[
&
](
float
eps
)
{
auto
f
=
[
&
](
float
eps
)
{
const
float
old
=
beta
.
host
()[
idx
];
const
float
old
=
beta
.
host
()[
idx
];
beta
.
host
()[
idx
]
+=
eps
;
beta
.
host
()[
idx
]
+=
eps
;
batch_normalize_conv
(
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
vars
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
float
result
=
dot
(
gradient_input
,
dest
);
float
result
=
dot
(
gradient_input
,
dest
);
beta
.
host
()[
idx
]
=
old
;
beta
.
host
()[
idx
]
=
old
;
return
result
;
return
result
;
...
@@ -307,7 +307,7 @@ namespace
...
@@ -307,7 +307,7 @@ namespace
gamma_grad
=
9
;
gamma_grad
=
9
;
beta_grad
=
9
;
beta_grad
=
9
;
batch_normalize_conv_gradient
(
gradient_input
,
means
,
vars
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
batch_normalize_conv_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
vars
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
auto
grad_error
=
compare_gradients
(
src_grad
,
grad_src
);
auto
grad_error
=
compare_gradients
(
src_grad
,
grad_src
);
...
@@ -662,11 +662,11 @@ namespace
...
@@ -662,11 +662,11 @@ namespace
rnd
.
fill_uniform
(
params_grad
);
rnd
.
fill_uniform
(
params_grad
);
resizable_tensor
mm
(
m
),
vv
(
v
);
resizable_tensor
mm
(
m
),
vv
(
v
);
cpu
::
compute_adam_update
(
s
,
mm
,
vv
,
t
,
0.01
,
0.001
,
0.9
,
0.99
,
params
,
params_grad
);
cpu
::
compute_adam_update
(
0
,
params
.
size
(),
s
,
mm
,
vv
,
t
,
0.01
,
0.001
,
0.9
,
0.99
,
params
,
params_grad
);
matrix
<
float
>
s1
=
mat
(
s
);
matrix
<
float
>
s1
=
mat
(
s
);
rnd
.
fill_uniform
(
s
);
rnd
.
fill_uniform
(
s
);
cuda
::
compute_adam_update
(
s
,
m
,
v
,
t
,
0.01
,
0.001
,
0.9
,
0.99
,
params
,
params_grad
);
cuda
::
compute_adam_update
(
0
,
params
.
size
(),
s
,
m
,
v
,
t
,
0.01
,
0.001
,
0.9
,
0.99
,
params
,
params_grad
);
matrix
<
float
>
s2
=
mat
(
s
);
matrix
<
float
>
s2
=
mat
(
s
);
DLIB_TEST_MSG
(
max
(
abs
(
s1
-
s2
))
<
1e-6
,
max
(
abs
(
s1
-
s2
)));
DLIB_TEST_MSG
(
max
(
abs
(
s1
-
s2
))
<
1e-6
,
max
(
abs
(
s1
-
s2
)));
...
@@ -775,6 +775,27 @@ namespace
...
@@ -775,6 +775,27 @@ namespace
cpu
::
affine_transform
(
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
,
5
);
cpu
::
affine_transform
(
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
,
5
);
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
cuda
::
affine_transform
(
dest
,
src
,
srcb
,
srcc
,
2
,
3
,
4
,
0
);
cpu
::
affine_transform
(
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
,
0
);
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
cuda
::
affine_transform_range
(
0
,
dest
.
size
(),
dest
,
src
,
srcb
,
srcc
,
2
,
3
,
4
);
cpu
::
affine_transform_range
(
0
,
dest2
.
size
(),
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
);
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
if
(
3
<
dest
.
size
())
{
dest
=
999
;
dest2
=
999
;
cuda
::
affine_transform_range
(
3
,
dest
.
size
()
-
1
,
dest
,
src
,
srcb
,
srcc
,
2
,
3
,
4
);
cpu
::
affine_transform_range
(
3
,
dest2
.
size
()
-
1
,
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
);
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
cuda
::
affine_transform_range
(
dest
.
size
(),
dest
.
size
(),
dest
,
src
,
srcb
,
srcc
,
2
,
3
,
4
);
cpu
::
affine_transform_range
(
dest2
.
size
(),
dest2
.
size
(),
dest2
,
src2
,
srcb2
,
srcc2
,
2
,
3
,
4
);
DLIB_TEST
(
equal
(
mat
(
dest
),
mat
(
dest2
)));
}
rnd
.
fill_uniform
(
dest
);
rnd
.
fill_uniform
(
dest
);
rnd
.
fill_uniform
(
src
);
rnd
.
fill_uniform
(
src
);
...
@@ -863,8 +884,8 @@ namespace
...
@@ -863,8 +884,8 @@ namespace
rnd
.
fill_uniform
(
src
);
rnd
.
fill_uniform
(
src
);
cpu
::
batch_normalize
(
dest
,
means
,
invstds
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cpu
::
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
invstds
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize
(
dest2
,
means2
,
invstds2
,
1
,
running_means2
,
running_variances2
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize
(
DEFAULT_BATCH_NORM_EPS
,
dest2
,
means2
,
invstds2
,
1
,
running_means2
,
running_variances2
,
src
,
gamma
,
beta
);
dlog
<<
LINFO
<<
"dest error: "
<<
max
(
abs
(
mat
(
dest
)
-
mat
(
dest2
)));
dlog
<<
LINFO
<<
"dest error: "
<<
max
(
abs
(
mat
(
dest
)
-
mat
(
dest2
)));
dlog
<<
LINFO
<<
"means error: "
<<
max
(
abs
(
mat
(
means
)
-
mat
(
means2
)));
dlog
<<
LINFO
<<
"means error: "
<<
max
(
abs
(
mat
(
means
)
-
mat
(
means2
)));
...
@@ -890,8 +911,8 @@ namespace
...
@@ -890,8 +911,8 @@ namespace
rnd
.
fill_uniform
(
gradient_input
);
rnd
.
fill_uniform
(
gradient_input
);
cpu
::
batch_normalize_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cpu
::
batch_normalize_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cuda
::
batch_normalize_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad2
,
gamma_grad2
,
beta_grad2
);
cuda
::
batch_normalize_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad2
,
gamma_grad2
,
beta_grad2
);
dlog
<<
LINFO
<<
"src_grad error: "
<<
max
(
abs
(
mat
(
src_grad
)
-
mat
(
src_grad2
)));
dlog
<<
LINFO
<<
"src_grad error: "
<<
max
(
abs
(
mat
(
src_grad
)
-
mat
(
src_grad2
)));
dlog
<<
LINFO
<<
"gamma_grad error: "
<<
max
(
abs
(
mat
(
gamma_grad
)
-
mat
(
gamma_grad2
)));
dlog
<<
LINFO
<<
"gamma_grad error: "
<<
max
(
abs
(
mat
(
gamma_grad
)
-
mat
(
gamma_grad2
)));
...
@@ -917,8 +938,8 @@ namespace
...
@@ -917,8 +938,8 @@ namespace
tt
::
tensor_rand
rnd
;
tt
::
tensor_rand
rnd
;
rnd
.
fill_uniform
(
src
);
rnd
.
fill_uniform
(
src
);
cpu
::
batch_normalize_conv
(
dest
,
means
,
invstds
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cpu
::
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest
,
means
,
invstds
,
1
,
running_means
,
running_variances
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize_conv
(
dest2
,
means2
,
invstds2
,
1
,
running_means2
,
running_variances2
,
src
,
gamma
,
beta
);
cuda
::
batch_normalize_conv
(
DEFAULT_BATCH_NORM_EPS
,
dest2
,
means2
,
invstds2
,
1
,
running_means2
,
running_variances2
,
src
,
gamma
,
beta
);
dlog
<<
LINFO
<<
"dest error: "
<<
max
(
abs
(
mat
(
dest
)
-
mat
(
dest2
)));
dlog
<<
LINFO
<<
"dest error: "
<<
max
(
abs
(
mat
(
dest
)
-
mat
(
dest2
)));
dlog
<<
LINFO
<<
"means error: "
<<
max
(
abs
(
mat
(
means
)
-
mat
(
means2
)));
dlog
<<
LINFO
<<
"means error: "
<<
max
(
abs
(
mat
(
means
)
-
mat
(
means2
)));
...
@@ -942,8 +963,8 @@ namespace
...
@@ -942,8 +963,8 @@ namespace
rnd
.
fill_uniform
(
gradient_input
);
rnd
.
fill_uniform
(
gradient_input
);
cpu
::
batch_normalize_conv_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cpu
::
batch_normalize_conv_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad
,
gamma_grad
,
beta_grad
);
cuda
::
batch_normalize_conv_gradient
(
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad2
,
gamma_grad2
,
beta_grad2
);
cuda
::
batch_normalize_conv_gradient
(
DEFAULT_BATCH_NORM_EPS
,
gradient_input
,
means
,
invstds
,
src
,
gamma
,
src_grad2
,
gamma_grad2
,
beta_grad2
);
dlog
<<
LINFO
<<
"src_grad error: "
<<
max
(
abs
(
mat
(
src_grad
)
-
mat
(
src_grad2
)));
dlog
<<
LINFO
<<
"src_grad error: "
<<
max
(
abs
(
mat
(
src_grad
)
-
mat
(
src_grad2
)));
dlog
<<
LINFO
<<
"gamma_grad error: "
<<
max
(
abs
(
mat
(
gamma_grad
)
-
mat
(
gamma_grad2
)));
dlog
<<
LINFO
<<
"gamma_grad error: "
<<
max
(
abs
(
mat
(
gamma_grad
)
-
mat
(
gamma_grad2
)));
...
@@ -1318,6 +1339,72 @@ namespace
...
@@ -1318,6 +1339,72 @@ namespace
DLIB_TEST
(
net2
.
subnet
().
subnet
().
subnet
().
layer_details
().
get_num_outputs
()
==
4
);
DLIB_TEST
(
net2
.
subnet
().
subnet
().
subnet
().
layer_details
().
get_num_outputs
()
==
4
);
}
}
// ----------------------------------------------------------------------------------------
// Core building block of the residual networks used in these tests.
// Template arguments:
//   N      - number of filters produced by each 3x3 convolution.
//   BN     - layer template wrapped around each convolution (the aliases
//            below pass bn_con or affine here).
//   stride - stride of the first (innermost) convolution; the second
//            convolution always uses a stride of 1.
//   SUBNET - the network this block is stacked on top of.
template <
    int N,
    template <typename> class BN,
    int stride,
    typename SUBNET
    >
using block = BN<con<N, 3, 3, 1, 1,
              relu<BN<con<N, 3, 3, stride, stride, SUBNET>>>>>;
// Wraps a block with a skip connection: the input is marked with tag1, fed
// through BLOCK (instantiated with a stride of 1), and add_prev1 then
// combines the block output with the tagged input.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual = add_prev1<BLOCK<N, BN, 1, tag1<SUBNET>>>;
// Downsampling variant of residual.  The input is marked with tag1 and run
// through BLOCK with a stride of 2; that result is marked with tag2.  skip1
// then re-exposes the tag1 input so it can be average pooled (2x2 window,
// stride 2), and add_prev2 combines the pooled input with the tag2 output.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual_down = add_prev2<
                          avg_pool<2, 2, 2, 2,
                              skip1<
                                  tag2<BLOCK<N, BN, 2, tag1<SUBNET>>>
                              >
                          >
                      >;
// Residual block with 8 filters and bn_con normalization, followed by relu.
template <typename SUBNET>
using res = relu<residual<block, 8, bn_con, SUBNET>>;
// Same structure as res, but with affine layers in place of bn_con.
template <typename SUBNET>
using ares = relu<residual<block, 8, affine, SUBNET>>;
// Downsampling residual block with 8 filters and bn_con normalization,
// followed by relu.
template <typename SUBNET>
using res_down = relu<residual_down<block, 8, bn_con, SUBNET>>;
// Same structure as res_down, but with affine layers in place of bn_con.
template <typename SUBNET>
using ares_down = relu<residual_down<block, 8, affine, SUBNET>>;
// Hand-written residual block that uses prelu activations instead of relu:
// tag1 -> con -> bn_con -> prelu -> con -> bn_con -> add_prev1 -> prelu.
// Both convolutions produce 8 filters with a 3x3 window and a stride of 1.
template <typename SUBNET>
using pres = prelu<add_prev1<
                 bn_con<con<8, 3, 3, 1, 1,
                 prelu<bn_con<con<8, 3, 3, 1, 1,
                 tag1<SUBNET>>>>>>>>;
// Checks that visit_layer_parameter_gradients() and visit_layer_parameters()
// each invoke their callback once for every computational layer of a network,
// and that the indices handed to the callback cover all of those layers.
void test_visit_funcions()
{
    using net_type2 = loss_multiclass_log<fc<10,
                          avg_pool_everything<
                          pres<res<res<res_down< // 2 prelu layers here
                          tag4<repeat<9, pres,   // 9 groups, each containing 2 prelu layers
                          res_down<
                          res<
                          input<matrix<unsigned char>>
                          >>>>>>>>>>>;

    net_type2 pnet;

    // Pin down the expected size of the network; the observed value is
    // passed along as the assertion's extra argument.
    DLIB_CASSERT(pnet.num_layers == 131, pnet.num_layers);
    DLIB_CASSERT(pnet.num_computational_layers == 109, pnet.num_computational_layers);

    // Pass 1: parameter gradients.
    std::vector<bool> grads_seen(pnet.num_computational_layers, false);
    size_t count = 0;
    visit_layer_parameter_gradients(pnet, [&grads_seen, &count](size_t idx, tensor&)
    {
        grads_seen[idx] = true;
        ++count;
    });
    for (auto x : grads_seen)
    {
        DLIB_TEST(x);
    }
    DLIB_TEST(count == pnet.num_computational_layers);

    // Pass 2: the parameters themselves, using a fresh set of flags and a
    // reset counter.
    count = 0;
    std::vector<bool> params_seen(pnet.num_computational_layers, false);
    visit_layer_parameters(pnet, [&params_seen, &count](size_t idx, tensor&)
    {
        params_seen[idx] = true;
        ++count;
    });
    for (auto x : params_seen)
    {
        DLIB_TEST(x);
    }
    DLIB_TEST(count == pnet.num_computational_layers);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class
dnn_tester
:
public
tester
class
dnn_tester
:
public
tester
...
@@ -1378,6 +1465,7 @@ namespace
...
@@ -1378,6 +1465,7 @@ namespace
test_batch_normalize_conv
();
test_batch_normalize_conv
();
test_basic_tensor_ops
();
test_basic_tensor_ops
();
test_layers
();
test_layers
();
test_visit_funcions
();
}
}
}
a
;
}
a
;
...
...
examples/dnn_mnist_advanced_ex.cpp
View file @
93e786db
...
@@ -20,29 +20,76 @@ using namespace dlib;
...
@@ -20,29 +20,76 @@ using namespace dlib;
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// Let's start by showing how you can conveniently define large
networks. The
// Let's start by showing how you can conveniently define large
and complex
// most important tool for doing this are C++'s alias templates.
These let us
//
networks. The
most important tools for doing this are C++'s alias templates.
// define new layer types that are combinations of a bunch of other
layers.
//
These let us
define new layer types that are combinations of a bunch of other
// These will form the building blocks for more complex networks.
//
layers.
These will form the building blocks for more complex networks.
// So let's begin by defining the building block of a residual network (see
// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun). You can see a few things in this statement. The most obvious is
// and Sun). We are going to decompose the residual block into a few alias
// that we have combined a bunch of layers into the name "base_res". You can
// statements. First, we define the core block.
// also see the use of the tag1 layer. This layer doesn't do any computation.
// It exists solely so other layers can refer to it. In this case, the
// Here we have parameterized the "block" layer on a BN layer (nominally some
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// kind of batch normalization), the number of filter outputs N, and the stride
// add it to the input of the add_prev1 layer. This combination allows us to
// the block operates at.
// implement skip and residual style networks. We have also made base_res
template
<
// parameterized by BN, which will let us insert different batch normalization
int
N
,
// layers.
template
<
typename
>
class
BN
,
template
<
template
<
typename
>
class
BN
,
typename
SUBNET
>
int
stride
,
using
base_res
=
relu
<
add_prev1
<
BN
<
con
<
8
,
3
,
3
,
1
,
1
,
relu
<
BN
<
con
<
8
,
3
,
3
,
1
,
1
,
tag1
<
SUBNET
>>>>>>>>
;
typename
SUBNET
>
// We also want a residual block that begins by doing downsampling. We can
using
block
=
BN
<
con
<
N
,
3
,
3
,
1
,
1
,
relu
<
BN
<
con
<
N
,
3
,
3
,
stride
,
stride
,
SUBNET
>>>>>
;
// reuse base_res to define it like this:
template
<
template
<
typename
>
class
BN
,
typename
SUBNET
>
// Next, we need to define the skip layer mechanism used in the residual network
using
base_res_down
=
base_res
<
BN
,
avg_pool
<
1
,
1
,
2
,
2
,
SUBNET
>>
;
// paper. They create their blocks by adding the input tensor to the output of
// each block. So we define an alias statement that takes a block and wraps it
// with this skip/add structure.
// Note the tag layer. This layer doesn't do any computation. It exists solely
// so other layers can refer to it. In this case, the add_prev1 layer looks for
// the tag1 layer and will take the tag1 output and add it to the input of the
// add_prev1 layer. This combination allows us to implement skip and residual
// style networks. We have also set the block stride to 1 in this statement.
// The significance of that is explained next.
// Skip/add wrapper: tag1 marks the input, BLOCK runs on it with a stride of
// 1, and add_prev1 combines the block output with the tag1 output.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual = add_prev1<BLOCK<N, BN, 1, tag1<SUBNET>>>;
// Some residual blocks do downsampling. They do this by using a stride of 2
// instead of 1. However, when downsampling we need to also take care to
// downsample the part of the network that adds the original input to the output
// or the sizes won't make sense (the network will still run, but the results
// aren't as good). So here we define a downsampling version of residual. In
// it, we make use of the skip1 layer. This layer simply outputs whatever is
// output by the tag1 layer. Therefore, the skip1 layer (there are also skip2,
// skip3, etc. in dlib) allows you to create branching network structures.
// residual_down creates a network structure like this:
/*
input from SUBNET
/ \
/ \
block downsample(using avg_pool)
\ /
\ /
add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
|
output
*/
// Downsampling skip/add wrapper: BLOCK runs on the tag1 input with a stride
// of 2 and its output is marked with tag2; skip1 re-exposes the tag1 input,
// which is average pooled (2x2 window, stride 2), and add_prev2 combines the
// pooled input with the tag2 output.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual_down = add_prev2<
                          avg_pool<2, 2, 2, 2,
                              skip1<
                                  tag2<BLOCK<N, BN, 2, tag1<SUBNET>>>
                              >
                          >
                      >;
// Now we can define 4 different residual blocks we will use in this example.
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// The first two are non-downsampling residual blocks while the last two
...
@@ -50,10 +97,10 @@ using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
...
@@ -50,10 +97,10 @@ using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// ares_down have had the batch normalization replaced with simple affine
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// layers. We will use the affine version of the layers when testing our
// networks.
// networks.
template
<
typename
SUBNET
>
using
res
=
base_res
<
bn_con
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
res
=
relu
<
residual
<
block
,
8
,
bn_con
,
SUBNET
>
>
;
template
<
typename
SUBNET
>
using
ares
=
base_res
<
affine
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares
=
relu
<
residual
<
block
,
8
,
affine
,
SUBNET
>
>
;
template
<
typename
SUBNET
>
using
res_down
=
base_res_down
<
bn_con
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
res_down
=
relu
<
residual_down
<
block
,
8
,
bn_con
,
SUBNET
>
>
;
template
<
typename
SUBNET
>
using
ares_down
=
base_res_down
<
affine
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares_down
=
relu
<
residual_down
<
block
,
8
,
affine
,
SUBNET
>
>
;
...
@@ -145,39 +192,41 @@ int main(int argc, char** argv) try
...
@@ -145,39 +192,41 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's
// These print statements will output this (I've truncated it since it's
// long, but you get the idea):
// long, but you get the idea):
/*
/*
The pnet has 1
27
layers in it.
The pnet has 1
31
layers in it.
layer<0> loss_multiclass_log
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10)
layer<1> fc (num_outputs=10)
learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<4> add_prev
layer<5> bn_con
layer<5> bn_con
eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<7> prelu (initial_param_value=0.25)
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con
layer<8> bn_con
eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<9> con
(num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<10> tag1
layer<10> tag1
...
...
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<34> relu
layer<34> tag1
layer<35> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<35> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<36> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<36> tag4
layer<37> tag1
layer<37> prelu (initial_param_value=0.3)
layer<38> tag4
layer<38> add_prev
layer<39> prelu (initial_param_value=0.3)
layer<39> bn_con
layer<40> add_prev
layer<41> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
...
...
layer<115> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<116> tag1
layer<117> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<118> relu
layer<118> relu
layer<119>
add_prev
layer<119>
bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<120>
bn_
con
layer<120> con
(num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<121>
con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<121>
tag1
layer<122> relu
layer<122> relu
layer<123> bn_con
layer<123> add_prev
layer<124> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<124> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<125> tag1
layer<125> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<126> input<matrix>
layer<126> relu
layer<127> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<128> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<129> tag1
layer<130> input<matrix>
*/
*/
// Now that we know the index numbers for each layer, we can access them
// Now that we know the index numbers for each layer, we can access them
...
@@ -195,7 +244,7 @@ int main(int argc, char** argv) try
...
@@ -195,7 +244,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also
// parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately
// index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say:
// after tag4 you can say:
layer
<
tag4
,
1
>
(
pnet
);
// Equivalent to layer<3
6
+1>(pnet).
layer
<
tag4
,
1
>
(
pnet
);
// Equivalent to layer<3
8
+1>(pnet).
// Or to access the layer 2 layers after tag4:
// Or to access the layer 2 layers after tag4:
layer
<
tag4
,
2
>
(
pnet
);
layer
<
tag4
,
2
>
(
pnet
);
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment