"...git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "22c0e300670672e4e0a8604bd6ab89bc28c970a6"
Commit 2ba29f65 authored by Davis King

Updated multiply()'s CUDA implementation to reflect its new features. Also added
a CUDA version of add_bias_gradient().
parent 2f34414e
@@ -48,13 +48,32 @@ namespace dlib
     // -----------------------------------------------------------------------------------

-        __global__ void _cuda_multiply(float* d, const float* s1, const float* s2, size_t n)
+        __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
         {
             for (auto i : grid_stride_range(0, n))
             {
                 d[i] = s1[i]*s2[i];
             }
         }
+
+        __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2,
+                                        size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] = 0;
+                for (size_t j = i; j < max_size; j += n)
+                    d[i] += s1[j%s1_n]*s2[j%s2_n];
+            }
+        }
+
+        __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2,
+                                        size_t n, size_t s1_n, size_t s2_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] = s1[i%s1_n]*s2[i%s2_n];
+            }
+        }

         void multiply (
             tensor& dest,
@@ -62,9 +81,36 @@ namespace dlib
             const tensor& src2
         )
         {
-            DLIB_CASSERT(dest.size()==src1.size(),"");
-            DLIB_CASSERT(dest.size()==src2.size(),"");
-            _cuda_multiply<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+            DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+                         dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+                         dest.nc() == src1.nc() && src1.nc() == src2.nc() ,"");
+            const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+            DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+                         (src1.num_samples()==1 || src1.num_samples()==MD) &&
+                         (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
+
+            if (dest.size() == 0)
+                return;
+
+            const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
+            const auto d = dest.host();
+            const auto s1 = src1.host();
+            const auto s2 = src2.host();
+
+            if (dest.size() == src1.size() && src1.size() == src2.size())
+            {
+                _cuda_multiply1<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+            }
+            else if (dest.num_samples() == 1)
+            {
+                _cuda_multiply2<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                             dest.size(), src1.size(), src2.size(), max_size);
+            }
+            else
+            {
+                _cuda_multiply3<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                             dest.size(), src1.size(), src2.size());
+            }
         }

     // -----------------------------------------------------------------------------------
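For reference, here is a minimal CPU-side sketch (not part of the commit) of what the three kernel cases above compute; the function name multiply_reference and the raw-pointer interface are made up for illustration, with sizes given as total element counts:

    // Hypothetical CPU reference for the three cases handled by multiply() above.
    #include <algorithm>
    #include <cstddef>

    // dest_n, s1_n, s2_n are total element counts; per the assertions above, tensors agree
    // in k/nr/nc and any operand with one sample broadcasts over the sample dimension.
    void multiply_reference(float* d, const float* s1, const float* s2,
                            std::size_t dest_n, std::size_t s1_n, std::size_t s2_n)
    {
        const std::size_t max_size = std::max({dest_n, s1_n, s2_n});
        if (dest_n == s1_n && s1_n == s2_n)
        {
            // case 1 (_cuda_multiply1): plain element-wise product
            for (std::size_t i = 0; i < dest_n; ++i)
                d[i] = s1[i]*s2[i];
        }
        else if (dest_n < max_size)
        {
            // case 2 (_cuda_multiply2): dest has one sample, so the broadcast
            // products are summed over the sample dimension
            for (std::size_t i = 0; i < dest_n; ++i)
            {
                d[i] = 0;
                for (std::size_t j = i; j < max_size; j += dest_n)
                    d[i] += s1[j%s1_n]*s2[j%s2_n];
            }
        }
        else
        {
            // case 3 (_cuda_multiply3): a one-sample operand is broadcast
            // across the samples of dest
            for (std::size_t i = 0; i < dest_n; ++i)
                d[i] = s1[i%s1_n]*s2[i%s2_n];
        }
    }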
@@ -184,6 +230,33 @@ namespace dlib
             }
         }

+    // -----------------------------------------------------------------------------------
+
+        __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                out[i] = in[i];
+                for (size_t j = i+n; j < total_n; j+=n)
+                    out[i] += in[j];
+            }
+        }
+
+        void add_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        )
+        {
+            DLIB_CASSERT(
+                  grad.num_samples() == 1 &&
+                  gradient_input.k() == grad.k() &&
+                  gradient_input.nr() == grad.nr() &&
+                  gradient_input.nc() == grad.nc() &&
+                  gradient_input.size() > 0,"");
+
+            _add_bias_gradient<<<512,512>>>(grad.device(), gradient_input.device(), grad.size(), gradient_input.size());
+        }
+
     // -----------------------------------------------------------------------------------
     // -----------------------------------------------------------------------------------
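For intuition, a minimal CPU sketch (not from the commit) of the reduction _add_bias_gradient performs: grad holds n elements, gradient_input holds total_n elements laid out as consecutive samples of n elements each, and each output element is the sum of the corresponding position over all samples. The name add_bias_gradient_reference is hypothetical:

    // Hypothetical CPU reference for _add_bias_gradient above.
    #include <cstddef>

    void add_bias_gradient_reference(float* out, const float* in, std::size_t n, std::size_t total_n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            out[i] = in[i];                      // first sample
            for (std::size_t j = i + n; j < total_n; j += n)
                out[i] += in[j];                 // accumulate the same position from the remaining samples
        }
    }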
@@ -363,37 +436,6 @@ namespace dlib
     // ----------------------------------------------------------------------------------------

-        // This function is from the article:
-        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-        __inline__ __device__ float warp_reduce_sum(float val)
-        {
-            for (int offset = warpSize/2; offset > 0; offset /= 2)
-                val += __shfl_down(val, offset);
-            return val;
-        }
-
-        __inline__ __device__ bool is_first_thread_in_warp()
-        {
-            return (threadIdx.x & (warpSize - 1)) == 0;
-        }
-
-        __inline__ __device__ void warp_reduce_atomic_add(
-            float& out,
-            float val
-        )
-        /*!
-            ensures
-                - Atomically adds all the val variables in the current warp to out.
-                  See this page for an extended discussion:
-                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-        !*/
-        {
-            val = warp_reduce_sum(val);
-            if (is_first_thread_in_warp())
-                atomicAdd(&out, val);
-        }
-
         __global__ void _cuda_batch_normalize_conv1(
             float* dest,
             float* means,
......
@@ -74,6 +74,13 @@ namespace dlib
             const tensor& B
         );

+    // -----------------------------------------------------------------------------------
+
+        void add_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        );
+
     // -----------------------------------------------------------------------------------

         void batch_normalize (
......
@@ -34,6 +34,41 @@ namespace dlib
 {
     namespace cuda
     {
+
+    // ------------------------------------------------------------------------------------
+
+        // This function is from the article:
+        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        __inline__ __device__ float warp_reduce_sum(float val)
+        {
+            for (int offset = warpSize/2; offset > 0; offset /= 2)
+                val += __shfl_down(val, offset);
+            return val;
+        }
+
+        __inline__ __device__ bool is_first_thread_in_warp()
+        {
+            return (threadIdx.x & (warpSize - 1)) == 0;
+        }
+
+        __inline__ __device__ void warp_reduce_atomic_add(
+            float& out,
+            float val
+        )
+        /*!
+            ensures
+                - Atomically adds all the val variables in the current warp to out.
+                  See this page for an extended discussion:
+                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        !*/
+        {
+            val = warp_reduce_sum(val);
+            if (is_first_thread_in_warp())
+                atomicAdd(&out, val);
+        }
+
+    // ------------------------------------------------------------------------------------
+
         class grid_stride_range
         {
             /*!
......
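As a usage note (not from this commit), a kernel that sums an array with the warp reduction helpers moved above might look like the following hypothetical sketch: each thread accumulates a grid-stride partial sum, the warp combines partials with __shfl_down, and only one atomicAdd per warp touches global memory.

    // Hypothetical example kernel (not in dlib) showing the intended use of
    // warp_reduce_atomic_add together with grid_stride_range.
    __global__ void _example_sum(float* out, const float* in, size_t n)
    {
        float partial = 0;
        for (auto i : grid_stride_range(0, n))    // per-thread partial sum over a grid-stride range
            partial += in[i];
        warp_reduce_atomic_add(*out, partial);    // one atomicAdd per warp; *out must be zero-initialized
    }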
@@ -520,6 +520,10 @@ namespace dlib
     template <typename SUBNET>
     using dropout = add_layer<dropout_, SUBNET>;

+// ----------------------------------------------------------------------------------------
+
+    // TODO, add spec for bn_ and affine_ layers.
+
 // ----------------------------------------------------------------------------------------

     class relu_
......
@@ -286,9 +286,7 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        // TODO
-        DLIB_CASSERT(false,"");
-        //cuda::add_bias_gradient(grad,gradient_input);
+        cuda::add_bias_gradient(grad,gradient_input);
 #else
         cpu::add_bias_gradient(grad,gradient_input);
 #endif
......
@@ -505,7 +505,7 @@ namespace
         dest.set_size(1,4);
         cuda::multiply(dest, A, B);
-        DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
+        DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));

         A.set_size(1,4);
         rnd.fill_uniform(A);
@@ -633,6 +633,11 @@ namespace
     void test_layers()
     {
+        {
+            print_spinner();
+            affine_ l;
+            DLIB_TEST_MSG(test_layer(l), test_layer(l));
+        }
         {
             print_spinner();
             bn_ l;
......