OpenDAS / dlib, commit 2ba29f65
Authored Dec 08, 2015 by Davis King
Updated multiply()'s CUDA implementation to reflect its new features. Also added a
CUDA version of add_bias_gradient().
Parent: 2f34414e

Showing 6 changed files with 130 additions and 39 deletions (+130 -39).
dlib/dnn/cuda_dlib.cu        +77  -35
dlib/dnn/cuda_dlib.h          +7   -0
dlib/dnn/cuda_utils.h        +35   -0
dlib/dnn/layers_abstract.h    +4   -0
dlib/dnn/tensor_tools.cpp     +1   -3
dlib/test/dnn.cpp             +6   -1
dlib/dnn/cuda_dlib.cu
@@ -48,13 +48,32 @@ namespace dlib
 // -----------------------------------------------------------------------------------

-    __global__ void _cuda_multiply(float* d, const float* s1, const float* s2, size_t n)
+    __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
     {
         for (auto i : grid_stride_range(0, n))
         {
             d[i] = s1[i]*s2[i];
         }
     }

+    __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2,
+                                    size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+    {
+        for (auto i : grid_stride_range(0, n))
+        {
+            d[i] = 0;
+            for (size_t j = i; j < max_size; j += n)
+                d[i] += s1[j%s1_n]*s2[j%s2_n];
+        }
+    }
+
+    __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2,
+                                    size_t n, size_t s1_n, size_t s2_n)
+    {
+        for (auto i : grid_stride_range(0, n))
+        {
+            d[i] = s1[i%s1_n]*s2[i%s2_n];
+        }
+    }
+
     void multiply (
         tensor& dest,
@@ -62,9 +81,36 @@ namespace dlib
         const tensor& src2
     )
     {
-        DLIB_CASSERT(dest.size()==src1.size(),"");
-        DLIB_CASSERT(dest.size()==src2.size(),"");
-        _cuda_multiply<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+        DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+            dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+            dest.nc() == src1.nc() && src1.nc() == src2.nc() ,"");
+        const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+        DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+            (src1.num_samples()==1 || src1.num_samples()==MD) &&
+            (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
+
+        if (dest.size() == 0)
+            return;
+
+        const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
+        const auto d = dest.host();
+        const auto s1 = src1.host();
+        const auto s2 = src2.host();
+        if (dest.size() == src1.size() && src1.size() == src2.size())
+        {
+            _cuda_multiply1<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+        }
+        else if (dest.num_samples() == 1)
+        {
+            _cuda_multiply2<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                         dest.size(), src1.size(), src2.size(), max_size);
+        }
+        else
+        {
+            _cuda_multiply3<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                         dest.size(), src1.size(), src2.size());
+        }
     }

 // -----------------------------------------------------------------------------------
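The rewritten multiply() now dispatches on shape: a plain element-wise product when all three tensors have the same size, a summing broadcast when dest has a single sample, and a modular broadcast of the smaller operand otherwise. A minimal host-side sketch of the indexing rule the three kernels share (illustrative reference code, not part of the commit; the function name is hypothetical):

    #include <algorithm>
    #include <cstddef>

    // Every output element i accumulates s1[j % s1_n] * s2[j % s2_n] over all
    // j in [0, max_size) with j congruent to i modulo the output size d_n.
    void multiply_reference(float* d, std::size_t d_n,
                            const float* s1, std::size_t s1_n,
                            const float* s2, std::size_t s2_n)
    {
        const std::size_t max_size = std::max(std::max(d_n, s1_n), s2_n);
        for (std::size_t i = 0; i < d_n; ++i)
        {
            d[i] = 0;
            for (std::size_t j = i; j < max_size; j += d_n)
                d[i] += s1[j % s1_n] * s2[j % s2_n];
        }
    }

When all sizes match this reduces to d[i] = s1[i]*s2[i] (_cuda_multiply1); when the output is the smallest tensor it sums the products over samples (_cuda_multiply2); when the output is the largest it tiles the smaller operand (_cuda_multiply3).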
@@ -184,6 +230,33 @@ namespace dlib
         }
     }

+    // -----------------------------------------------------------------------------------
+
+    __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
+    {
+        for (auto i : grid_stride_range(0, n))
+        {
+            out[i] = in[i];
+            for (size_t j = i+n; j < total_n; j+=n)
+                out[i] += in[j];
+        }
+    }
+
+    void add_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    )
+    {
+        DLIB_CASSERT(
+            grad.num_samples() == 1 &&
+            gradient_input.k() == grad.k() &&
+            gradient_input.nr() == grad.nr() &&
+            gradient_input.nc() == grad.nc() &&
+            gradient_input.size() > 0,"");
+
+        _add_bias_gradient<<<512,512>>>(grad.device(), gradient_input.device(),
+                                        grad.size(), gradient_input.size());
+    }
+
 // -----------------------------------------------------------------------------------

 // -----------------------------------------------------------------------------------
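add_bias_gradient() folds a multi-sample gradient tensor into a single-sample grad tensor. A plain reference loop for what the kernel computes (illustrative sketch, not from the commit; the function name is hypothetical):

    #include <cstddef>

    // grad holds n elements (one sample); gradient_input holds total_n = num_samples * n.
    // Each grad[i] becomes the sum of gradient_input over all samples at position i,
    // mirroring the kernel's out[i] = in[i]; out[i] += in[i+n], in[i+2n], ...
    void add_bias_gradient_reference(float* grad, const float* gradient_input,
                                     std::size_t n, std::size_t total_n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            grad[i] = gradient_input[i];
            for (std::size_t j = i + n; j < total_n; j += n)
                grad[i] += gradient_input[j];
        }
    }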
@@ -363,37 +436,6 @@ namespace dlib
 // ----------------------------------------------------------------------------------------

-    // This function is from the article:
-    // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-    __inline__ __device__ float warp_reduce_sum(float val)
-    {
-        for (int offset = warpSize/2; offset > 0; offset /= 2)
-            val += __shfl_down(val, offset);
-        return val;
-    }
-
-    __inline__ __device__ bool is_first_thread_in_warp()
-    {
-        return (threadIdx.x & (warpSize - 1)) == 0;
-    }
-
-    __inline__ __device__ void warp_reduce_atomic_add(
-        float& out,
-        float val
-    )
-    /*!
-        ensures
-            - Atomically adds all the val variables in the current warp to out.
-              See this page for an extended discussion:
-              http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-    !*/
-    {
-        val = warp_reduce_sum(val);
-        if (is_first_thread_in_warp())
-            atomicAdd(&out, val);
-    }
-
     __global__ void _cuda_batch_normalize_conv1(
         float* dest,
         float* means,
dlib/dnn/cuda_dlib.h
@@ -74,6 +74,13 @@ namespace dlib
         const tensor& B
     );

+// -----------------------------------------------------------------------------------
+
+    void add_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    );
+
 // -----------------------------------------------------------------------------------

     void batch_normalize (
dlib/dnn/cuda_utils.h
@@ -34,6 +34,41 @@ namespace dlib
 {
     namespace cuda
     {
+    // ------------------------------------------------------------------------------------
+
+        // This function is from the article:
+        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        __inline__ __device__ float warp_reduce_sum(float val)
+        {
+            for (int offset = warpSize/2; offset > 0; offset /= 2)
+                val += __shfl_down(val, offset);
+            return val;
+        }
+
+        __inline__ __device__ bool is_first_thread_in_warp()
+        {
+            return (threadIdx.x & (warpSize - 1)) == 0;
+        }
+
+        __inline__ __device__ void warp_reduce_atomic_add(
+            float& out,
+            float val
+        )
+        /*!
+            ensures
+                - Atomically adds all the val variables in the current warp to out.
+                  See this page for an extended discussion:
+                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        !*/
+        {
+            val = warp_reduce_sum(val);
+            if (is_first_thread_in_warp())
+                atomicAdd(&out, val);
+        }
+
+    // ------------------------------------------------------------------------------------
+
         class grid_stride_range
         {
             /*!
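Moving these helpers into cuda_utils.h makes them available to any kernel that includes it. A hypothetical kernel (not in this commit) showing how they compose with grid_stride_range to sum an array:

    // Assumes *out was zeroed before launch; each thread accumulates a private
    // partial sum, then each warp folds its partials into one atomicAdd on *out.
    __global__ void _example_array_sum(float* out, const float* in, size_t n)
    {
        float partial = 0;
        for (auto i : grid_stride_range(0, n))
            partial += in[i];
        warp_reduce_atomic_add(*out, partial);
    }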
dlib/dnn/layers_abstract.h
@@ -520,6 +520,10 @@ namespace dlib
     template <typename SUBNET>
     using dropout = add_layer<dropout_, SUBNET>;

+// ----------------------------------------------------------------------------------------
+
+    // TODO, add spec for bn_ and affine_ layers.
+
 // ----------------------------------------------------------------------------------------

     class relu_
dlib/dnn/tensor_tools.cpp
@@ -286,9 +286,7 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        // TODO
-        DLIB_CASSERT(false,"");
-        //cuda::add_bias_gradient(grad,gradient_input);
+        cuda::add_bias_gradient(grad,gradient_input);
 #else
         cpu::add_bias_gradient(grad,gradient_input);
 #endif
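With the CUDA path wired up, the tensor_tools wrapper now behaves the same on either backend. A hypothetical usage sketch (shapes and names are illustrative only; it assumes the enclosing tt function is add_bias_gradient(grad, gradient_input), as the cpu::/cuda:: calls suggest):

    #include <dlib/dnn.h>

    void example()
    {
        dlib::resizable_tensor gradient_input, bias_grad;
        gradient_input.set_size(10, 3, 4, 4);  // 10 samples of 3x4x4 gradients
        bias_grad.set_size(1, 3, 4, 4);        // single-sample accumulator
        // Sums gradient_input over its samples into bias_grad.
        dlib::tt::add_bias_gradient(bias_grad, gradient_input);
    }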
dlib/test/dnn.cpp
@@ -505,7 +505,7 @@ namespace
         dest.set_size(1,4);
         cuda::multiply(dest, A, B);
-        DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
+        DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));

         A.set_size(1,4);
         rnd.fill_uniform(A);
@@ -633,6 +633,11 @@ namespace
     void test_layers()
     {
+        {
+            print_spinner();
+            affine_ l;
+            DLIB_TEST_MSG(test_layer(l), test_layer(l));
+        }
         {
             print_spinner();
             bn_ l;