Commit cbce85ec, authored Dec 05, 2015 by Davis King
Added GPU versions of the batch normalization functions.
parent 06534305
Showing 3 changed files with 586 additions and 12 deletions (+586 −12)
dlib/dnn/cpu_dlib.cpp   +4 −4
dlib/dnn/cuda_dlib.cu   +479 −8
dlib/test/dnn.cpp       +103 −0
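For reference, the new CUDA kernels in this commit implement the same batch-normalization transform already used by the CPU code. Per output position \(i\), with \(N\) samples:

\[
\mu_i = \frac{1}{N}\sum_{n} x_{n,i}, \qquad
v_i = \frac{1}{N}\sum_{n} x_{n,i}^2 - \mu_i^2, \qquad
y_{n,i} = \gamma_i\,\frac{x_{n,i}-\mu_i}{\sqrt{v_i+\varepsilon}} + \beta_i
\]

with \(\varepsilon = 10^{-5}\). The means and invstds tensors store \(\mu_i\) and \(1/\sqrt{v_i+\varepsilon}\) respectively; note that both the CPU and GPU paths compute the variance in the \(E[x^2]-E[x]^2\) form.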
dlib/dnn/cpu_dlib.cpp
@@ -185,7 +185,7 @@ namespace dlib
            for (long i = 0; i < num; ++i)
            {
                auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
-               p_invstds[i] = 1.0/std::sqrt(actual_var + eps);
+               p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -361,8 +361,8 @@ namespace dlib
            // compute variances
            for (long k = 0; k < src.k(); ++k)
            {
-               auto actual_var = p_invstds[k] - p_means[k]*p_means[k];
-               p_invstds[k] = 1.0/std::sqrt(actual_var + eps);
+               float actual_var = p_invstds[k] - p_means[k]*p_means[k];
+               p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -421,7 +421,7 @@ namespace dlib
...
@@ -421,7 +421,7 @@ namespace dlib
{
{
for
(
long
k
=
0
;
k
<
src
.
k
();
++
k
)
for
(
long
k
=
0
;
k
<
src
.
k
();
++
k
)
{
{
const
auto
invstd_pow
=
-
0.5
*
std
::
pow
(
p_invstds
[
k
],
3.0
f
);
const
float
invstd_pow
=
-
0.5
*
std
::
pow
(
p_invstds
[
k
],
3.0
f
);
for
(
long
i
=
0
;
i
<
num
;
++
i
)
for
(
long
i
=
0
;
i
<
num
;
++
i
)
{
{
const
float
x_hat
=
(
*
p_src
-
p_means
[
k
])
*
p_invstds
[
k
];
const
float
x_hat
=
(
*
p_src
-
p_means
[
k
])
*
p_invstds
[
k
];
...
...
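The cpu_dlib.cpp changes above are small but deliberate: with the plain 1.0 literal the division happens in double precision and the result is then narrowed back to float on assignment, while the 1.0f form keeps the whole expression in single precision, matching what the new GPU kernels compute. A minimal sketch of the difference (the function and variable names here are illustrative, not from the commit):

#include <cmath>

// The double literal 1.0 promotes the division to double precision,
// even though std::sqrt(float) returns float; the result is then
// narrowed back to float on return.
float invstd_double_path(float v, float eps) { return 1.0 / std::sqrt(v + eps); }

// With the f suffix the expression stays in float end to end.
float invstd_float_path(float v, float eps) { return 1.0f / std::sqrt(v + eps); }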
dlib/dnn/cuda_dlib.cu
@@ -163,8 +163,48 @@ namespace dlib
            }
        }

    // -----------------------------------------------------------------------------------

+       __global__ void _cuda_batch_normalize(
+           float* dest,
+           float* means,
+           float* invstds,
+           const float* src,
+           const float* gamma,
+           const float* beta,
+           long num,
+           long num_samples
+       )
+       {
+           const float eps = 0.00001;
+           const float invnum = 1.0f/num_samples;
+           for (auto i : grid_stride_range(0, num))
+           {
+               means[i] = 0;
+               invstds[i] = 0;
+               for (long n = 0; n < num_samples; ++n)
+               {
+                   float val = src[n*num+i];
+                   means[i] += val;
+                   invstds[i] += val*val;
+               }
+               means[i] *= invnum;
+               invstds[i] *= invnum;
+
+               float actual_var = invstds[i] - means[i]*means[i];
+               invstds[i] = 1.0f/::sqrt(actual_var + eps);
+
+               for (long n = 0; n < num_samples; ++n)
+               {
+                   long idx = n*num+i;
+                   float temp = (src[idx] - means[i])*invstds[i];
+                   dest[idx] = temp*gamma[i] + beta[i];
+               }
+           }
+       }
+
        void batch_normalize (
            resizable_tensor& dest,
            resizable_tensor& means,
@@ -174,8 +214,90 @@ namespace dlib
            const tensor& beta
        )
        {
-           // TODO
-           DLIB_CASSERT(false,"");
+           DLIB_CASSERT(
+               src.num_samples() > 1 &&
+               gamma.num_samples() == 1 &&
+               beta.num_samples() == 1 &&
+               gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
+               gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
+               gamma.k()  == beta.k()  && beta.k() == src.k(),
+               "\ngamma.num_samples(): " << gamma.num_samples() <<
+               "\ngamma.k():  " << gamma.k() <<
+               "\ngamma.nr(): " << gamma.nr() <<
+               "\ngamma.nc(): " << gamma.nc() <<
+               "\nbeta.num_samples(): " << beta.num_samples() <<
+               "\nbeta.k():   " << beta.k() <<
+               "\nbeta.nr():  " << beta.nr() <<
+               "\nbeta.nc():  " << beta.nc() <<
+               "\nsrc.k():   " << src.k() <<
+               "\nsrc.nr():  " << src.nr() <<
+               "\nsrc.nc():  " << src.nc()
+           );
+
+           dest.copy_size(src);
+           means.set_size(1, src.k(), src.nr(), src.nc());
+           invstds.set_size(1, src.k(), src.nr(), src.nc());
+
+           _cuda_batch_normalize<<<512,512>>>(dest.device(), means.device(), invstds.device(),
+                                              src.device(), gamma.device(), beta.device(),
+                                              means.size(), src.num_samples());
+       }
+
+       __global__ void _cuda_batch_normalize_gradient(
+           const float* grad,
+           const float* means,
+           const float* invstds,
+           const float* src,
+           const float* gamma,
+           float* src_grad,
+           float* gamma_grad,
+           float* beta_grad,
+           float* dmeans,
+           float* dvars,
+           long num,
+           long num_samples
+       )
+       {
+           const float invnum = 1.0f/num_samples;
+           for (auto i : grid_stride_range(0, num))
+           {
+               dvars[i] = 0;
+               dmeans[i] = 0;
+               for (long n = 0; n < num_samples; ++n)
+               {
+                   const long idx = n*num+i;
+                   const float x_hat = (src[idx] - means[i])*invstds[i];
+                   beta_grad[i] += grad[idx];
+                   gamma_grad[i] += grad[idx]*x_hat;
+
+                   const float dx = grad[idx]*gamma[i];
+                   dvars[i] += dx*(src[idx] - means[i])*-0.5*::pow(invstds[i], 3.0f);
+               }
+
+               for (long n = 0; n < num_samples; ++n)
+               {
+                   const long idx = n*num+i;
+                   const float dx = grad[idx]*gamma[i];
+                   dmeans[i] += dx*-invstds[i] + dvars[i]*-2*(src[idx] - means[i])*invnum;
+               }
+
+               for (long n = 0; n < num_samples; ++n)
+               {
+                   const long idx = n*num+i;
+                   const float dx = grad[idx]*gamma[i];
+                   src_grad[idx] += dx*invstds[i] +
+                                    dvars[i]*2*(src[idx] - means[i])*invnum +
+                                    dmeans[i]*invnum;
+               }
+           }
+       }
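An editorial note on the kernel above: its three accumulation loops are the standard batch-norm backward pass. Writing \(s_i = 1/\sqrt{v_i+\varepsilon}\) for the stored inverse standard deviation, \(g\) for the incoming gradient, and \(N\) for num_samples:

\[
\frac{\partial L}{\partial \beta_i} = \sum_n g_{n,i}, \qquad
\frac{\partial L}{\partial \gamma_i} = \sum_n g_{n,i}\,\hat{x}_{n,i}, \qquad
\frac{\partial L}{\partial v_i} = -\frac{s_i^3}{2}\sum_n g_{n,i}\,\gamma_i\,(x_{n,i}-\mu_i)
\]
\[
\frac{\partial L}{\partial \mu_i} = \sum_n \left(-g_{n,i}\,\gamma_i\,s_i - \frac{2}{N}\,\frac{\partial L}{\partial v_i}\,(x_{n,i}-\mu_i)\right), \qquad
\frac{\partial L}{\partial x_{n,i}} \mathrel{+}= g_{n,i}\,\gamma_i\,s_i + \frac{2}{N}\,\frac{\partial L}{\partial v_i}\,(x_{n,i}-\mu_i) + \frac{1}{N}\,\frac{\partial L}{\partial \mu_i}
\]

The loops must run in this order because the second uses the completed \(\partial L/\partial v_i\) and the third uses the completed \(\partial L/\partial \mu_i\).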
        void batch_normalize_gradient::operator() (

@@ -189,12 +311,141 @@ namespace dlib
            tensor& beta_grad
        )
        {
-           // TODO
-           DLIB_CASSERT(false,"");
+           const long num = src.k()*src.nr()*src.nc();
+           DLIB_CASSERT(num == means.size(),"");
+           DLIB_CASSERT(num == invstds.size(),"");
+           DLIB_CASSERT(num == gamma.size(),"");
+           DLIB_CASSERT(num == gamma_grad.size(),"");
+           DLIB_CASSERT(num == beta_grad.size(),"");
+           DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
+           DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+
+           dvars.copy_size(invstds);
+           dmeans.copy_size(means);
+
+           _cuda_batch_normalize_gradient<<<512,512>>>(
+               gradient_input.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), src_grad.device(),
+               gamma_grad.device(), beta_grad.device(),
+               dmeans.device(), dvars.device(), num, src.num_samples());
        }
    // ----------------------------------------------------------------------------------------

+       // This function is from the article:
+       // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+       __inline__ __device__ float warp_reduce_sum(float val)
+       {
+           for (int offset = warpSize/2; offset > 0; offset /= 2)
+               val += __shfl_down(val, offset);
+           return val;
+       }
+
+       __inline__ __device__ bool is_first_thread_in_warp()
+       {
+           return (threadIdx.x & (warpSize - 1)) == 0;
+       }
+
+       __inline__ __device__ void warp_reduce_atomic_add(
+           float& out,
+           float val
+       )
+       /*!
+           ensures
+               - Atomically adds all the val variables in the current warp to out.
+                 See this page for an extended discussion:
+                 http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+       !*/
+       {
+           val = warp_reduce_sum(val);
+           if (is_first_thread_in_warp())
+               atomicAdd(&out, val);
+       }
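An aside on the helpers above: each __shfl_down step lets a lane add the value held by the lane offset positions higher, so a 32-lane warp is summed in five steps with no shared memory, and only lane 0 touches global memory. A self-contained sketch of the same pattern (the kernel and array names are illustrative; it uses __shfl_down_sync, which later CUDA releases introduced to replace the __shfl_down intrinsic used in this 2015-era code):

__global__ void sum_array(const float* in, float* out, int n)
{
    // Each thread accumulates a private partial sum over a grid-stride loop.
    float val = 0;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += blockDim.x*gridDim.x)
        val += in[i];

    // Tree reduction within the warp via register shuffles.
    for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);

    // Lane 0 of each warp now holds the warp's total; publish it.
    if ((threadIdx.x & (warpSize - 1)) == 0)
        atomicAdd(out, val);
}

As with the kernels below, the accumulator behind out must be zeroed before launch; that is why batch_normalize_conv() sets means = 0 and invstds = 0 before calling _cuda_batch_normalize_conv1.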
+       __global__ void _cuda_batch_normalize_conv1(
+           float* dest,
+           float* means,
+           float* invstds,
+           const float* src,
+           const float* gamma,
+           const float* beta,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           for (long k = 0; k < num_k; ++k)
+           {
+               float mval = 0;
+               float ival = 0;
+               // Now do two parallel reductions to compute the first two moments of the
+               // data.
+               for (auto j : grid_stride_range(0, num_samples*num_pixels))
+               {
+                   long i = j%num_pixels;
+                   long n = j/num_pixels;
+                   float val = src[n*num_k*num_pixels + k*num_pixels + i];
+                   mval += val;
+                   ival += val*val;
+               }
+               warp_reduce_atomic_add(means[k], mval);
+               warp_reduce_atomic_add(invstds[k], ival);
+           }
+       }
+
+       __global__ void _cuda_batch_normalize_conv2(
+           float* means,
+           float* invstds,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           const float scale = 1.0f/(num_samples*num_pixels);
+           const float eps = 0.00001;
+           for (auto k : grid_stride_range(0, num_k))
+           {
+               means[k] *= scale;
+               auto actual_var = scale*invstds[k] - means[k]*means[k];
+               invstds[k] = 1.0f/::sqrt(actual_var + eps);
+           }
+       }
+
+       __global__ void _cuda_batch_normalize_conv3(
+           float* dest,
+           float* means,
+           float* invstds,
+           const float* src,
+           const float* gamma,
+           const float* beta,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           for (long k = 0; k < num_k; ++k)
+           {
+               for (auto j : grid_stride_range(0, num_samples*num_pixels))
+               {
+                   long i = j%num_pixels;
+                   long n = j/num_pixels;
+                   i = n*num_k*num_pixels + k*num_pixels + i;
+                   dest[i] = (src[i] - means[k])*invstds[k];
+                   dest[i] = dest[i]*gamma[k] + beta[k];
+               }
+           }
+       }
+
        void batch_normalize_conv (
            resizable_tensor& dest,
            resizable_tensor& means,

@@ -204,8 +455,172 @@ namespace dlib
            const tensor& beta
        )
        {
-           // TODO
-           DLIB_CASSERT(false,"");
+           DLIB_CASSERT(
+               src.num_samples() > 1 &&
+               gamma.num_samples() == 1 &&
+               beta.num_samples() == 1 &&
+               gamma.nr() == 1 &&
+               beta.nr() == 1 &&
+               gamma.nc() == 1 &&
+               beta.nc() == 1 &&
+               gamma.k()  == beta.k()  && beta.k() == src.k(),
+               "\ngamma.num_samples(): " << gamma.num_samples() <<
+               "\ngamma.k():  " << gamma.k() <<
+               "\ngamma.nr(): " << gamma.nr() <<
+               "\ngamma.nc(): " << gamma.nc() <<
+               "\nbeta.num_samples(): " << beta.num_samples() <<
+               "\nbeta.k():   " << beta.k() <<
+               "\nbeta.nr():  " << beta.nr() <<
+               "\nbeta.nc():  " << beta.nc() <<
+               "\nsrc.k():   " << src.k() <<
+               "\nsrc.nr():  " << src.nr() <<
+               "\nsrc.nc():  " << src.nc()
+           );
+
+           dest.copy_size(src);
+           means.set_size(1, src.k());
+           invstds.set_size(1, src.k());
+           means = 0;
+           invstds = 0;
+
+           _cuda_batch_normalize_conv1<<<512,512>>>(dest.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), beta.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
+           _cuda_batch_normalize_conv2<<<512,512>>>(means.device(), invstds.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
+           _cuda_batch_normalize_conv3<<<512,512>>>(dest.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), beta.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
+       }
+
+       __global__ void _cuda_batch_normalize_conv_gradient1(
+           const float* grad,
+           const float* means,
+           const float* invstds,
+           const float* src,
+           const float* gamma,
+           float* src_grad,
+           float* gamma_grad,
+           float* beta_grad,
+           float* dmeans,
+           float* dvars,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           for (long k = 0; k < num_k; ++k)
+           {
+               float bval = 0;
+               float gval = 0;
+               float dval = 0;
+               const float invstd_pow = -0.5f*::pow(invstds[k], 3.0f);
+               // Now do three parallel reductions
+               for (auto j : grid_stride_range(0, num_samples*num_pixels))
+               {
+                   long i = j%num_pixels;
+                   long n = j/num_pixels;
+                   long idx = n*num_k*num_pixels + k*num_pixels + i;
+                   const float x_hat = (src[idx] - means[k])*invstds[k];
+                   bval += grad[idx];
+                   gval += grad[idx]*x_hat;
+
+                   const float dx = grad[idx]*gamma[k];
+                   dval += dx*(src[idx] - means[k])*invstd_pow;
+               }
+               warp_reduce_atomic_add(beta_grad[k], bval);
+               warp_reduce_atomic_add(gamma_grad[k], gval);
+               warp_reduce_atomic_add(dvars[k], dval);
+           }
+       }
+
+       __global__ void _cuda_batch_normalize_conv_gradient2(
+           const float* grad,
+           const float* means,
+           const float* invstds,
+           const float* src,
+           const float* gamma,
+           float* src_grad,
+           float* gamma_grad,
+           float* beta_grad,
+           float* dmeans,
+           float* dvars,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           const float invnum = 1.0f/(num_samples*num_pixels);
+           for (long k = 0; k < num_k; ++k)
+           {
+               float mval = 0;
+               // Now do a parallel reduction
+               for (auto j : grid_stride_range(0, num_samples*num_pixels))
+               {
+                   long i = j%num_pixels;
+                   long n = j/num_pixels;
+                   long idx = n*num_k*num_pixels + k*num_pixels + i;
+                   const float dx = grad[idx]*gamma[k];
+                   mval += -dx*invstds[k] + dvars[k]*-2*(src[idx] - means[k])*invnum;
+               }
+               warp_reduce_atomic_add(dmeans[k], mval);
+           }
+       }
+
+       __global__ void _cuda_batch_normalize_conv_gradient3(
+           const float* grad,
+           const float* means,
+           const float* invstds,
+           const float* src,
+           const float* gamma,
+           float* src_grad,
+           float* gamma_grad,
+           float* beta_grad,
+           float* dmeans,
+           float* dvars,
+           long num_k,
+           long num_samples,
+           long num_pixels
+       )
+       {
+           const float invnum = 1.0f/(num_samples*num_pixels);
+           for (long k = 0; k < num_k; ++k)
+           {
+               for (auto j : grid_stride_range(0, num_samples*num_pixels))
+               {
+                   long i = j%num_pixels;
+                   long n = j/num_pixels;
+                   long idx = n*num_k*num_pixels + k*num_pixels + i;
+                   const float dx = grad[idx]*gamma[k];
+                   src_grad[idx] += dx*invstds[k] +
+                                    dvars[k]*2*(src[idx] - means[k])*invnum +
+                                    dmeans[k]*invnum;
+               }
+           }
+       }

        void batch_normalize_conv_gradient::operator() (

@@ -219,8 +634,64 @@ namespace dlib
            tensor& beta_grad
        )
        {
-           // TODO
-           DLIB_CASSERT(false,"");
+           DLIB_CASSERT(src.k() == means.size(),"");
+           DLIB_CASSERT(src.k() == invstds.size(),"");
+           DLIB_CASSERT(src.k() == gamma.size(),"");
+           DLIB_CASSERT(src.k() == gamma_grad.size(),"");
+           DLIB_CASSERT(src.k() == beta_grad.size(),"");
+           DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
+           DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+
+           dvars.copy_size(invstds);
+           dmeans.copy_size(means);
+           dvars = 0;
+           dmeans = 0;
+
+           _cuda_batch_normalize_conv_gradient1<<<512,512>>>(
+               gradient_input.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), src_grad.device(),
+               gamma_grad.device(), beta_grad.device(),
+               dmeans.device(), dvars.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
+           _cuda_batch_normalize_conv_gradient2<<<512,512>>>(
+               gradient_input.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), src_grad.device(),
+               gamma_grad.device(), beta_grad.device(),
+               dmeans.device(), dvars.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
+           _cuda_batch_normalize_conv_gradient3<<<512,512>>>(
+               gradient_input.device(), means.device(), invstds.device(),
+               src.device(), gamma.device(), src_grad.device(),
+               gamma_grad.device(), beta_grad.device(),
+               dmeans.device(), dvars.device(),
+               src.k(), src.num_samples(), src.nr()*src.nc());
        }

    // -----------------------------------------------------------------------------------
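A note on the looping idiom used throughout this file: dlib's grid_stride_range helper expresses the classic CUDA grid-stride loop, which is what lets every kernel above be launched with a fixed <<<512,512>>> configuration regardless of tensor size. A rough plain-CUDA equivalent of the pattern (the kernel and its body are illustrative, not from this commit):

__global__ void example(float* data, long num)
{
    // equivalent of: for (auto i : grid_stride_range(0, num))
    const long stride = (long)blockDim.x*gridDim.x;
    for (long i = blockIdx.x*blockDim.x + threadIdx.x; i < num; i += stride)
        data[i] *= 2;  // illustrative per-element work
}

Each thread starts at its global index and advances by the total number of launched threads, so correctness does not depend on the launch covering num exactly.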
dlib/test/dnn.cpp
@@ -460,6 +460,107 @@ namespace
        }
#endif
+// ----------------------------------------------------------------------------------------
+
+   void compare_bn_gpu_and_cpu()
+   {
+       print_spinner();
+       resizable_tensor dest, dest2;
+       resizable_tensor means, means2;
+       resizable_tensor invstds, invstds2;
+       resizable_tensor src(64,20,100,100);
+       resizable_tensor gamma(1,20,100,100);
+       resizable_tensor beta(1,20,100,100);
+       gamma = 2;
+       beta = 3;
+       tt::tensor_rand rnd;
+       rnd.fill_uniform(src);
+       cpu::batch_normalize(dest, means, invstds, src, gamma, beta);
+       cuda::batch_normalize(dest2, means2, invstds2, src, gamma, beta);
+
+       dlog << LINFO << "dest error:    " << max(abs(mat(dest) - mat(dest2)));
+       dlog << LINFO << "means error:   " << max(abs(mat(means) - mat(means2)));
+       dlog << LINFO << "invstds error: " << max(abs(mat(invstds) - mat(invstds2)));
+
+       DLIB_TEST(max(abs(mat(dest) - mat(dest2))) < 1e-5);
+       DLIB_TEST(max(abs(mat(means) - mat(means2))) < 1e-5);
+       DLIB_TEST(max(abs(mat(invstds) - mat(invstds2))) < 1e-5);
+
+       // now check that the gradients match as well
+       resizable_tensor gradient_input;
+       resizable_tensor src_grad, gamma_grad, beta_grad;
+       resizable_tensor src_grad2, gamma_grad2, beta_grad2;
+       gradient_input.copy_size(dest);
+       src_grad.copy_size(src);     src_grad = 0;   src_grad2 = src_grad;
+       gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
+       beta_grad.copy_size(beta);   beta_grad = 0;  beta_grad2 = beta_grad;
+       rnd.fill_uniform(gradient_input);
+
+       cpu::batch_normalize_gradient cpu_bng;
+       cpu_bng(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+       cuda::batch_normalize_gradient cuda_bng;
+       cuda_bng(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
+
+       dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad) - mat(src_grad2)));
+       dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad) - mat(gamma_grad2)));
+       dlog << LINFO << "beta_grad error:  " << max(abs(mat(beta_grad) - mat(beta_grad2)));
+
+       DLIB_TEST(max(abs(mat(src_grad) - mat(src_grad2))) < 1e-5);
+       DLIB_TEST(max(abs(mat(gamma_grad) - mat(gamma_grad2))) < 1e-5);
+       DLIB_TEST(max(abs(mat(beta_grad) - mat(beta_grad2))) < 1e-5);
+   }
+   void compare_bn_conv_gpu_and_cpu()
+   {
+       print_spinner();
+       resizable_tensor dest, dest2;
+       resizable_tensor means, means2;
+       resizable_tensor invstds, invstds2;
+       resizable_tensor src(2,8,10,9);
+       resizable_tensor gamma(1,8);
+       resizable_tensor beta(1,8);
+       gamma = 2;
+       beta = 3;
+       tt::tensor_rand rnd;
+       rnd.fill_uniform(src);
+       cpu::batch_normalize_conv(dest, means, invstds, src, gamma, beta);
+       cuda::batch_normalize_conv(dest2, means2, invstds2, src, gamma, beta);
+
+       dlog << LINFO << "dest error:    " << max(abs(mat(dest) - mat(dest2)));
+       dlog << LINFO << "means error:   " << max(abs(mat(means) - mat(means2)));
+       dlog << LINFO << "invstds error: " << max(abs(mat(invstds) - mat(invstds2)));
+
+       DLIB_TEST(max(abs(mat(dest) - mat(dest2))) < 1e-4);
+       DLIB_TEST(max(abs(mat(means) - mat(means2))) < 1e-4);
+       DLIB_TEST(max(abs(mat(invstds) - mat(invstds2))) < 1e-4);
+
+       resizable_tensor gradient_input;
+       resizable_tensor src_grad, gamma_grad, beta_grad;
+       resizable_tensor src_grad2, gamma_grad2, beta_grad2;
+       gradient_input.copy_size(dest);
+       src_grad.copy_size(src);     src_grad = 0;   src_grad2 = src_grad;
+       gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
+       beta_grad.copy_size(beta);   beta_grad = 0;  beta_grad2 = beta_grad;
+       rnd.fill_uniform(gradient_input);
+
+       cpu::batch_normalize_conv_gradient cpu_bng;
+       cpu_bng(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+       cuda::batch_normalize_conv_gradient cuda_bng;
+       cuda_bng(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
+
+       dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad) - mat(src_grad2)));
+       dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad) - mat(gamma_grad2)));
+       dlog << LINFO << "beta_grad error:  " << max(abs(mat(beta_grad) - mat(beta_grad2)));
+
+       DLIB_TEST(max(abs(mat(src_grad) - mat(src_grad2))) < 1e-4);
+       DLIB_TEST(max(abs(mat(gamma_grad) - mat(gamma_grad2))) < 1e-4);
+       DLIB_TEST(max(abs(mat(beta_grad) - mat(beta_grad2))) < 1e-4);
+   }
// ----------------------------------------------------------------------------------------

    class dnn_tester : public tester

@@ -488,6 +589,8 @@ namespace
            test_batch_normalize();
            test_batch_normalize_conv();
            test_basic_tensor_ops();
+           compare_bn_gpu_and_cpu();
+           compare_bn_conv_gpu_and_cpu();
        }
    } a;