Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dlib
Commits
24687a9e
"src/array/vscode:/vscode.git/clone" did not exist on "cb0e1103ed1e9adc5d09ca0fd6399b203d0aeb0c"
Commit
24687a9e
authored
Jan 22, 2016
by
Davis King
Browse files
Added grid_stride_range_y cuda tool
parent
32dd3f2f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
85 additions
and
0 deletions
+85
-0
dlib/dnn/cuda_utils.h
dlib/dnn/cuda_utils.h
+85
-0
No files found.
dlib/dnn/cuda_utils.h
View file @
24687a9e
...
@@ -249,6 +249,91 @@ namespace dlib
...
@@ -249,6 +249,91 @@ namespace dlib
size_t
iend
;
size_t
iend
;
};
};
// ------------------------------------------------------------------------------------
class grid_stride_range_y
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is the y-axis counterpart of grid_stride_range.  It is used in the
            same way, except that the indices it hands out are computed from CUDA's
            y built-ins (threadIdx.y, blockIdx.y, blockDim.y and gridDim.y) instead
            of the x ones.  Using the two objects together lets a kernel express a
            nested 2D parallel for loop rather than only a 1D loop.

            For instance, with a kernel launch like:
                dim3 blocks(10,1);
                dim3 threads(32,32);  // x and y must both differ from 1 to get parallelism over both loops.
                add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);

            a kernel that adds two 2D matrices could be written as:
                __global__ void add_arrays(
                    const float* a,
                    const float* b,
                    float* out,
                    size_t nr,
                    size_t nc
                )
                {
                    for (auto r : grid_stride_range_y(0, nr))
                    {
                        for (auto c : grid_stride_range(0, nc))
                        {
                            auto i = r*nc+c;
                            out[i] = a[i]+b[i];
                        }
                    }
                }

            ITERATION DETAILS
                - begin() starts the calling thread at
                  ibegin_ + blockDim.y*blockIdx.y + threadIdx.y.
                - Each increment advances the position by gridDim.y*blockDim.y.
                - The loop keeps running while the position is less than iend_.
    !*/

public:

    // Describes the half-open index range [ibegin_, iend_).
    __device__ grid_stride_range_y(
        size_t ibegin_,
        size_t iend_
    ) :
        first_index(ibegin_),
        last_index(iend_)
    {}

    class iterator
    {
    public:

        // Leaves the stored position unset; assign a real iterator before use.
        __device__ iterator() {}

        // Creates an iterator sitting at index pos_.
        __device__ iterator(size_t pos_) : current(pos_) {}

        // The index this iterator currently refers to.
        __device__ size_t operator*() const
        {
            return current;
        }

        // Jump ahead by the number of threads spanning the grid's y dimension.
        __device__ iterator& operator++()
        {
            const auto y_stride = gridDim.y * blockDim.y;
            current += y_stride;
            return *this;
        }

        // Deliberately a less-than test rather than an inequality test: a thread's
        // position can step past the end index without ever being equal to it, and
        // the range-based for loop must still terminate in that case.
        __device__ bool operator!=(const iterator& rhs) const
        {
            return current < rhs.current;
        }

    private:
        size_t current;
    };

    // First index handled by the calling thread.
    __device__ iterator begin() const
    {
        const auto block_offset = blockDim.y * blockIdx.y;
        return iterator(first_index + block_offset + threadIdx.y);
    }

    // Sentinel holding the end of the range.
    __device__ iterator end() const
    {
        return iterator(last_index);
    }

private:
    size_t first_index;
    size_t last_index;
};
// ------------------------------------------------------------------------------------
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment