OpenDAS / dlib, commit f7310f4b
Authored Aug 11, 2017 by Davis King

Added multiply_zero_padded()

Parent: 46a02d94
Showing 7 changed files with 305 additions and 0 deletions.
Changed files:

    dlib/dnn/cpu_dlib.cpp       +73   -0
    dlib/dnn/cpu_dlib.h          +7   -0
    dlib/dnn/cuda_dlib.cu      +128   -0
    dlib/dnn/cuda_dlib.h         +7   -0
    dlib/dnn/tensor_tools.cpp   +14   -0
    dlib/dnn/tensor_tools.h     +17   -0
    dlib/test/dnn.cpp           +59   -0
dlib/dnn/cpu_dlib.cpp

@@ -265,6 +265,79 @@ namespace dlib
                    }
                }
            }
        }

    // ----------------------------------------------------------------------------------------

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        )
        {
            auto d = dest.host();
            auto s1 = src1.host();
            auto s2 = src2.host();

            // Do the simple and fast version if everything has the same dimensions
            if (have_same_dimensions(dest, src1) &&
                have_same_dimensions(dest, src2))
            {
                if (add_to)
                {
                    for (size_t i = 0; i < dest.size(); ++i)
                        d[i] += s1[i]*s2[i];
                }
                else
                {
                    for (size_t i = 0; i < dest.size(); ++i)
                        d[i] = s1[i]*s2[i];
                }
                return;
            }

            // Otherwise, do the more complex version with bounds checking.
            for (long n = 0; n < dest.num_samples(); ++n)
            {
                for (long k = 0; k < dest.k(); ++k)
                {
                    for (long r = 0; r < dest.nr(); ++r)
                    {
                        for (long c = 0; c < dest.nc(); ++c)
                        {
                            float v1 = 0;
                            float v2 = 0;

                            // if this index is inside src1
                            if (n < src1.num_samples() && k < src1.k() && r < src1.nr() && c < src1.nc())
                            {
                                const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
                                v1 = s1[s_idx];
                            }

                            // if this index is inside src2
                            if (n < src2.num_samples() && k < src2.k() && r < src2.nr() && c < src2.nc())
                            {
                                const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
                                v2 = s2[s_idx];
                            }

                            if (add_to)
                                *d += v1*v2;
                            else
                                *d = v1*v2;
                            ++d;
                        }
                    }
                }
            }
        }

    // ----------------------------------------------------------------------------------------

        void assign_bias_gradient (
    ...
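To make the zero-padding rule in the bounds-checked loop above concrete, here is a minimal standalone sketch. It uses plain float arrays with hard-coded shapes (not dlib code): a 1x1x2x2 dest and src1 multiplied by a 1x1x1x1 src2, where every src2 element that falls outside its bounds is taken to be 0.

    #include <cassert>

    int main()
    {
        float src1[4] = {1, 2, 3, 4};   // 1x1x2x2
        float src2[1] = {10};           // 1x1x1x1
        float dest[4];                  // 1x1x2x2

        for (long r = 0; r < 2; ++r)
        {
            for (long c = 0; c < 2; ++c)
            {
                float v1 = src1[r*2 + c];                   // dest and src1 share the same shape
                float v2 = (r < 1 && c < 1) ? src2[0] : 0;  // src2 is smaller, so it is zero padded
                dest[r*2 + c] = v1*v2;                      // mirrors the add_to==false branch
            }
        }

        // Only the top-left element of dest sees a non-zero src2 value.
        assert(dest[0] == 10 && dest[1] == 0 && dest[2] == 0 && dest[3] == 0);
        return 0;
    }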
dlib/dnn/cpu_dlib.h

@@ -30,6 +30,13 @@ namespace dlib
            const tensor& src2
        );

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void add (
            float beta,
            tensor& dest,
    ...
dlib/dnn/cuda_dlib.cu

@@ -513,6 +513,134 @@ namespace dlib
            }
        }

    // ------------------------------------------------------------------------------------

        __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
        {
            for (auto i : grid_stride_range(0, n))
            {
                d[i] = s1[i]*s2[i];
            }
        }

        __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
        {
            for (auto i : grid_stride_range(0, n))
            {
                d[i] += s1[i]*s2[i];
            }
        }

        __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
                                    size_t dn, size_t dk, size_t dr, size_t dc,
                                    size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                    size_t s2n, size_t s2k, size_t s2r, size_t s2c)
        {
            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
            {
                size_t n,k,r,c;
                unpack_idx(i, dk,dr,dc, n,k,r,c);

                float v1 = 0;
                float v2 = 0;

                if (n < s1n && k < s1k && r < s1r && c < s1c)
                {
                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
                }

                if (n < s2n && k < s2k && r < s2r && c < s2c)
                {
                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
                }

                d[i] = v1*v2;
            }
        }

        __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
                                           size_t dn, size_t dk, size_t dr, size_t dc,
                                           size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                           size_t s2n, size_t s2k, size_t s2r, size_t s2c)
        {
            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
            {
                size_t n,k,r,c;
                unpack_idx(i, dk,dr,dc, n,k,r,c);

                float v1 = 0;
                float v2 = 0;

                if (n < s1n && k < s1k && r < s1r && c < s1c)
                {
                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
                }

                if (n < s2n && k < s2k && r < s2r && c < s2c)
                {
                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
                }

                d[i] += v1*v2;
            }
        }

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        )
        {
            if (dest.size() == 0)
                return;

            // Do the simple and fast version if everything has the same dimensions
            if (have_same_dimensions(dest, src1) &&
                have_same_dimensions(dest, src2))
            {
                if (add_to)
                    launch_kernel(_cuda_mult1_add_to, max_jobs(dest.size()),
                                  dest.device(), src1.device(), src2.device(), dest.size());
                else
                    launch_kernel(_cuda_mult1, max_jobs(dest.size()),
                                  dest.device(), src1.device(), src2.device(), dest.size());
            }
            else
            {
                if (add_to)
                {
                    // Otherwise, do the more complex version with bounds checking.
                    launch_kernel(_cuda_mult2_add_to, max_jobs(dest.size()),
                                  dest.device(), src1.device(), src2.device(),
                                  dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                                  src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                                  src2.num_samples(), src2.k(), src2.nr(), src2.nc());
                }
                else
                {
                    // Otherwise, do the more complex version with bounds checking.
                    launch_kernel(_cuda_mult2, max_jobs(dest.size()),
                                  dest.device(), src1.device(), src2.device(),
                                  dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                                  src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                                  src2.num_samples(), src2.k(), src2.nr(), src2.nc());
                }
            }
        }

    // ------------------------------------------------------------------------------------

        __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
    ...
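The bounds-checked kernels rely on pack_idx and unpack_idx helpers whose definitions are not part of this diff. A hypothetical stand-in pair, consistent with the row-major offset arithmetic used by the CPU implementation above, might look like the following (the names, signatures, and bodies here are illustrative, not dlib's actual helpers):

    #include <cstddef>
    #include <cassert>

    // Map (n,k,r,c) to a flat row-major offset for a tensor of shape (*, dk, dr, dc).
    inline std::size_t pack(std::size_t dk, std::size_t dr, std::size_t dc,
                            std::size_t n, std::size_t k, std::size_t r, std::size_t c)
    {
        return ((n*dk + k)*dr + r)*dc + c;
    }

    // Invert pack(): recover (n,k,r,c) from a flat offset.
    inline void unpack(std::size_t idx, std::size_t dk, std::size_t dr, std::size_t dc,
                       std::size_t& n, std::size_t& k, std::size_t& r, std::size_t& c)
    {
        c = idx % dc;  idx /= dc;
        r = idx % dr;  idx /= dr;
        k = idx % dk;  idx /= dk;
        n = idx;
    }

    int main()
    {
        // Round trip: packing then unpacking returns the original coordinates.
        std::size_t n, k, r, c;
        unpack(pack(3,4,5, 1,2,3,4), 3,4,5, n, k, r, c);
        assert(n == 1 && k == 2 && r == 3 && c == 4);
        return 0;
    }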
dlib/dnn/cuda_dlib.h

@@ -185,6 +185,13 @@ namespace dlib
            const tensor& src2
        );

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void add (
            tensor& dest,
            const tensor& src1,
    ...
dlib/dnn/tensor_tools.cpp

@@ -304,6 +304,20 @@ namespace dlib { namespace tt
#endif
    }

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    )
    {
#ifdef DLIB_USE_CUDA
        cuda::multiply_zero_padded(add_to, dest, src1, src2);
#else
        cpu::multiply_zero_padded(add_to, dest, src1, src2);
#endif
    }

// ----------------------------------------------------------------------------------------

    void affine_transform (
...
dlib/dnn/tensor_tools.h

@@ -306,6 +306,23 @@ namespace dlib { namespace tt
            - Instead of assigning the result to dest, this function adds the result to dest.
    !*/

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    );
    /*!
        ensures
            - if (add_to) then
                - performs: dest += src1*src2
            - else
                - performs: dest = src1*src2
            - In either case, the multiplication happens pointwise according to 4D tensor
              arithmetic.  If the dimensions don't match then missing elements are presumed
              to be equal to 0.
    !*/

// ----------------------------------------------------------------------------------------

    void affine_transform (
...
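A minimal usage sketch of the new tt wrapper follows. The shapes are chosen arbitrarily, and it assumes the usual <dlib/dnn.h> include together with the tt::multiply_zero_padded and tensor_rand declarations shown in this commit:

    #include <dlib/dnn.h>

    using namespace dlib;

    int main()
    {
        // dest and src1 are 1x2x4x4 while src2 is only 1x2x2x2, so the missing
        // src2 elements are treated as 0 and the corresponding dest elements become 0.
        resizable_tensor dest(1, 2, 4, 4);
        resizable_tensor src1(1, 2, 4, 4);
        resizable_tensor src2(1, 2, 2, 2);

        tt::tensor_rand rnd;
        rnd.fill_uniform(src1);
        rnd.fill_uniform(src2);

        tt::multiply_zero_padded(false, dest, src1, src2);  // dest  = src1*src2 (zero padded)
        tt::multiply_zero_padded(true,  dest, src1, src2);  // dest += src1*src2 (zero padded)
        return 0;
    }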
dlib/test/dnn.cpp

@@ -904,6 +904,64 @@ namespace
            DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
        }
    }

    void test_multiply_zero_padded()
    {
        print_spinner();
        dlib::rand rnd;
        tt::tensor_rand trnd;
        for (int iter = 0; iter < 300; ++iter)
        {
            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1);
            resizable_tensor dest2;
            dest2.copy_size(dest1);
            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);

            trnd.fill_uniform(dest1);
            trnd.fill_uniform(dest2);
            trnd.fill_uniform(src1);
            trnd.fill_uniform(src2);

            cpu::multiply_zero_padded(false, dest1, src1, src2);
            cuda::multiply_zero_padded(false, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);

            cpu::multiply_zero_padded(true, dest1, src1, src2);
            cuda::multiply_zero_padded(true, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
        }

        // make sure we have a test for the case where all tensors have the same
        // dimensions.
        resizable_tensor dest1(3,4,5,6);
        resizable_tensor dest2;
        resizable_tensor src1;
        resizable_tensor src2;
        dest2.copy_size(dest1);
        src1.copy_size(dest1);
        src2.copy_size(dest1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);

        cpu::multiply_zero_padded(false, dest1, src1, src2);
        cuda::multiply_zero_padded(false, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);

        cpu::multiply_zero_padded(true, dest1, src1, src2);
        cuda::multiply_zero_padded(true, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
    }

    void test_add()
    {
        print_spinner();
    ...

@@ -2606,6 +2664,7 @@ namespace
        compare_bn_gpu_and_cpu();
        compare_bn_conv_gpu_and_cpu();
        test_add();
        test_multiply_zero_padded();
        compare_adam();
        test_copy_tensor_gpu();
#endif
    ...