OpenDAS / dlib

Commit 546bdf51, authored Jan 26, 2018 by Davis King
Parent: 19b16e1a

Switched 2D kernels to use the new 2D launch_kernel(). Also added an overload of
dot_prods() that can accumulate in addition to assign.
Showing 5 changed files with 113 additions and 12 deletions.
dlib/dnn/cuda_dlib.cu      +53  -12
dlib/dnn/cuda_dlib.h        +7   -0
dlib/dnn/tensor_tools.cpp  +17   -0
dlib/dnn/tensor_tools.h    +18   -0
dlib/test/dnn.cpp          +18   -0
dlib/dnn/cuda_dlib.cu

@@ -142,9 +142,8 @@ namespace dlib
     )
     {
         invnorms.set_size(data.num_samples());
-        dim3 blocks(1,10);   // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-        dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-        _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+        launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+                      invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
     }

 // ----------------------------------------------------------------------------------------
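The hunk above replaces a hard-coded launch configuration (dim3 blocks(1,10) with dim3 threads(32,32)) with launch_kernel(), which sizes a 2D grid from the amount of work passed in via max_jobs(columns, rows). The standalone sketch below only illustrates that general idea under assumed semantics; row_sums_2d() and pick_grid() are hypothetical names and are not dlib's launch_kernel()/max_jobs() implementation.

#include <cuda_runtime.h>
#include <algorithm>

// Threads stride over rows (y) and columns (x), so the grid size no longer has to be
// hard-coded.  Assumes out[] was zeroed beforehand, since each thread atomically adds
// its partial sum for the row.
__global__ void row_sums_2d(float* out, const float* data, size_t nr, size_t nc)
{
    for (size_t i = blockIdx.y*blockDim.y + threadIdx.y; i < nr; i += (size_t)gridDim.y*blockDim.y)
    {
        float temp = 0;
        for (size_t j = blockIdx.x*blockDim.x + threadIdx.x; j < nc; j += (size_t)gridDim.x*blockDim.x)
            temp += data[i*nc + j];
        // A real kernel would first reduce within the warp (as warp_reduce_atomic_add()
        // presumably does) so that only one atomic per warp is issued per row.
        atomicAdd(out + i, temp);
    }
}

// Pick a 2D grid roughly proportional to the work in each dimension, capped so very
// large tensors don't request an absurd number of blocks.
inline dim3 pick_grid(size_t x_work, size_t y_work, dim3 threads)
{
    auto div_up = [](size_t a, size_t b) { return (a + b - 1)/b; };
    unsigned gx = (unsigned)std::min<size_t>(std::max<size_t>(div_up(x_work, threads.x), 1), 64);
    unsigned gy = (unsigned)std::min<size_t>(std::max<size_t>(div_up(y_work, threads.y), 1), 64);
    return dim3(gx, gy);
}

void launch_row_sums(float* out, const float* data, size_t nr, size_t nc)
{
    const dim3 threads(32,32);   // 32 along x keeps whole warps working on each row
    row_sums_2d<<<pick_grid(nc, nr, threads), threads>>>(out, data, nr, nc);
}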
@@ -170,16 +169,57 @@ namespace dlib
         }
     }

+    __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+    {
+        for (auto i : grid_stride_range_y(0, nr))
+        {
+            auto l = lhs + i*nc;
+            auto r = rhs + i*nc;
+            float temp = 0;
+            for (auto j : grid_stride_range(0, nc))
+                temp += l[j]*r[j];
+
+            // and store the sum into out[i]
+            warp_reduce_atomic_add(out[i], temp);
+        }
+    }
+
     void dot_prods (
         resizable_tensor& out,
         const tensor& lhs,
         const tensor& rhs
     )
     {
+        DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+
         out.set_size(lhs.num_samples());
-        dim3 blocks(1,10);   // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-        dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-        _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+        if (out.size() == 0)
+            return;
+
+        const auto nr = lhs.num_samples();
+        const auto nc = lhs.size()/lhs.num_samples();
+
+        launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
     }

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    )
+    {
+        DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+        DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+        DLIB_CASSERT(out.size() == lhs.num_samples());
+
+        const auto nr = lhs.num_samples();
+        const auto nc = lhs.size()/lhs.num_samples();
+        if (add_to)
+            launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+        else
+            launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+    }
+
 // ----------------------------------------------------------------------------------------
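The new _cuda_dot_prods_add_to kernel ends each row with warp_reduce_atomic_add(out[i], temp), which is also why the old hard-coded launch kept 32 threads along x: the threads striding over a row's columns form whole warps. dlib's helper is not shown in this diff; the sketch below is only the typical shape of such a primitive (a warp shuffle reduction followed by a single atomicAdd from lane 0) and is an assumption, not dlib's code.

// Hypothetical illustration of a warp-level "reduce then atomically add" helper.
// Requires CUDA 9+ for __shfl_down_sync; dlib's warp_reduce_atomic_add() may differ.
__device__ inline void warp_sum_then_atomic_add(float& dest, float partial)
{
    const unsigned full_mask = 0xffffffffu;
    // Fold the 32 per-lane partial sums down into lane 0.
    for (int offset = 16; offset > 0; offset /= 2)
        partial += __shfl_down_sync(full_mask, partial, offset);
    // Only lane 0 of the warp touches global memory.
    if ((threadIdx.x & 31) == 0)
        atomicAdd(&dest, partial);
}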
@@ -501,14 +541,15 @@ namespace dlib
             if (dest.size() == 0)
                 return;

-            dim3 blocks(1,10);   // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            const auto bs = src1.nr()*src1.nc();
+            const auto n = src1.num_samples()*src1.k();
             if (add_to)
-                _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
-                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+                    dest.device(), src1.device(), n, src2.device(), bs, src1.k());
             else
-                _cuda_multiply_conv2<<<blocks,threads>>>(
-                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+                    dest.device(), src1.device(), n, src2.device(), bs, src1.k());
         }
     }
dlib/dnn/cuda_dlib.h

@@ -121,6 +121,13 @@ namespace dlib
             const tensor& rhs
         );

+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
         void scale_columns (
             tensor& out,
             const tensor& m,
dlib/dnn/tensor_tools.cpp

@@ -69,6 +69,23 @@ namespace dlib { namespace tt
 #endif
     }

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::dot_prods(add_to, out, lhs, rhs);
+#else
+        if (add_to)
+            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        else
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+    }
+
     void scale_columns (
         tensor& out,
         const tensor& m,
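As a plain-loop restatement of the CPU branch above (illustration only; dot_prods_reference() is not a dlib function): treating each tensor as an nr-by-nc matrix, row i of the output is the dot product of row i of lhs with row i of rhs, either assigned or accumulated depending on add_to.

#include <cstddef>
#include <vector>

// Scalar reference: out[i] = (add_to ? out[i] : 0) + sum_j lhs(i,j)*rhs(i,j)
void dot_prods_reference(bool add_to, std::vector<float>& out,
                         const std::vector<float>& lhs, const std::vector<float>& rhs,
                         std::size_t nr, std::size_t nc)
{
    out.resize(nr, 0.0f);
    for (std::size_t i = 0; i < nr; ++i)
    {
        float temp = 0;
        for (std::size_t j = 0; j < nc; ++j)
            temp += lhs[i*nc + j]*rhs[i*nc + j];
        out[i] = (add_to ? out[i] : 0.0f) + temp;   // accumulate or assign
    }
}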
dlib/dnn/tensor_tools.h

@@ -50,6 +50,24 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    );
+    /*!
+        requires
+            - have_same_dimensions(lhs,rhs) == true
+            - out.size() == lhs.num_samples()
+            - out.k() == out.nr() == out.nc() == 1
+        ensures
+            - if (add_to) then
+                - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            - else
+                - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+    !*/
+
     void scale_columns (
         tensor& out,
         const tensor& m,
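A minimal usage sketch of the documented overload, modeled on the test added in dlib/test/dnn.cpp below; it assumes the usual dlib headers and that out already has the required size by the time the add_to overload is called.

#include <dlib/dnn.h>
using namespace dlib;

void dot_prods_usage()
{
    resizable_tensor a(123,432), b(123,432), out;
    tt::tensor_rand rnd;
    rnd.fill_gaussian(a);
    rnd.fill_gaussian(b);

    tt::dot_prods(out, a, b);         // original overload: resizes out and assigns the row-wise dot products
    tt::dot_prods(false, out, a, b);  // new overload, add_to == false: overwrites out with the same result
    tt::dot_prods(true, out, a, b);   // add_to == true: accumulates, so out now holds twice the row-wise dot products
}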
dlib/test/dnn.cpp

@@ -1256,6 +1256,24 @@ namespace
             out2 = scale_rows(mat(data), mat(invnorms1));
             DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
         }
+        {
+            resizable_tensor a(123,432), b(123,432);
+            rnd.fill_gaussian(a);
+            rnd.fill_gaussian(b);
+
+            resizable_tensor out;
+            dot_prods(out, a, b);
+            const matrix<float> truth = sum_cols(pointwise_multiply(mat(a), mat(b)));
+            DLIB_TEST(max(abs(mat(out)-truth)) < 1e-4);
+            out = 0;
+            DLIB_TEST(max(abs(mat(out)-truth)) > 1e-2);
+            dot_prods(false, out, a, b);
+            DLIB_TEST(max(abs(mat(out)-truth)) < 1e-4);
+            dot_prods(true, out, a, b);
+            DLIB_TEST(max(abs(mat(out)/2-truth)) < 1e-4);
+            DLIB_TEST(max(abs(mat(out)-truth)) > 1e-2);
+        }
     }

 // ----------------------------------------------------------------------------------------