Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
dfa02139
Commit
dfa02139
authored
Oct 30, 2018
by
Chao Liu
Browse files
convolution: init cuda run
parent
49ceb0fe
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
523 additions
and
51 deletions
+523
-51
driver/conv.cu
driver/conv.cu
+55
-23
src/include/device_tensor.cuh
src/include/device_tensor.cuh
+15
-21
src/include/direct_convolution.cuh
src/include/direct_convolution.cuh
+453
-7
No files found.
driver/conv.cu
View file @
dfa02139
#include <iostream>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "tensor.hpp"
#include "device_tensor.cuh"
#include "device_tensor.cuh"
#include "direct_convolution.cuh"
#include "direct_convolution.cuh"
// Functor used to fill tensors with deterministic values.
// Reconstructed from a scrape-mangled diff (post-commit side): each call
// receives the multi-dimensional index of an element and returns a value
// derived from it, so neighbouring elements get distinct, reproducible data.
template <class T>
struct Generator
{
    // Constant used by the disabled branch below; kept so aggregate
    // initialization like Generator<float>{1} (used by callers) still works.
    T value = 0;

    // Called once per tensor element with that element's indices (is...).
    // Returns the sum of the indices, converted to T.
    template <class... Is>
    T operator()(Is... is)
    {
#if 0
        // Debug toggle: emit a constant instead of index-dependent data.
        return value;
#else
        // Materialize the variadic index pack so it can be accumulated.
        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
#endif
    }
};
template
<
typename
T
>
template
<
typename
T
>
void
host_convolution
(
const
Tensor
<
T
>&
in
,
void
host_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
const
Tensor
<
T
>&
wei
,
...
@@ -37,24 +57,39 @@ void host_convolution(const Tensor<T>& in,
...
@@ -37,24 +57,39 @@ void host_convolution(const Tensor<T>& in,
}
}
template
<
class
T
>
template
<
class
T
>
void
device_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
void
device_convolution
(
Tensor
<
T
>&
in
,
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
{
{
DeviceTensorDescriptor
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
<
4
>
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
wei_desc_device
(
wei
.
mDesc
);
DeviceTensorDescriptor
<
4
>
wei_desc_device
(
wei
.
mDesc
);
DeviceTensorDescriptor
out_desc_device
(
out
.
mDesc
);
DeviceTensorDescriptor
<
4
>
out_desc_device
(
out
.
mDesc
);
printf
(
"__func__: in_desc_device: %u %u %u %u
\n
"
,
in_desc_device
.
GetLength
(
0
),
in_desc_device
.
GetLength
(
1
),
in_desc_device
.
GetLength
(
2
),
in_desc_device
.
GetLength
(
3
));
std
::
size_t
data_sz
=
sizeof
(
T
);
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
#if 1
in
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
#endif
out
.
GenerateTensorValue
(
Generator
<
float
>
{
0
},
num_thread
);
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
dim3
block_dim
(
25
6
,
1
,
1
);
dim3
block_dim
(
6
4
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
direct
_convolution
<
T
,
256
>
gridwise
_convolution
<
T
,
3
,
3
,
4
,
4
,
2
,
2
,
1
,
1
,
32
,
32
,
1
>
<<<
grid_dim
,
block_dim
>>>
(
in_desc_device
,
<<<
grid_dim
,
block_dim
>>>
(
in_desc_device
,
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
wei_desc_device
,
wei_desc_device
,
...
@@ -65,23 +100,20 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
...
@@ -65,23 +100,20 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
template
<
class
T
>
struct
Generator
{
template
<
class
...
Is
>
T
operator
()(
Is
...
is
)
{
return
1
;
}
};
int
main
()
int
main
()
{
{
#if 0
#if 0
Tensor<float> in({3, 16, 1
28
, 1
28
});
Tensor<float> in({3, 16, 1
30
, 1
30
});
Tensor<float> wei({4, 16, 3, 3});
Tensor<float> wei({4, 16, 3, 3});
Tensor<float> out_host({3, 4, 126, 126});
Tensor<float> out_host({3, 4, 128, 128});
#elif
0
Tensor
<
float
>
in
({
1
,
1
,
130
,
130
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
out_host
({
1
,
1
,
128
,
128
});
#elif 1
Tensor
<
float
>
in
({
1
,
1
,
18
,
18
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
out_host
({
1
,
1
,
16
,
16
});
#else
#else
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
...
@@ -93,16 +125,16 @@ int main()
...
@@ -93,16 +125,16 @@ int main()
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
in
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
in
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
host_convolution
(
in
,
wei
,
out_host
,
num_thread
);
//
host_convolution(in, wei, out_host, num_thread);
device_convolution
(
in
,
wei
,
out_device
);
device_convolution
(
in
,
wei
,
out_device
);
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
//
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange
(
std
::
cout
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
}
}
src/include/device_tensor.cuh
View file @
dfa02139
#pragma once
#pragma once
#include <algorithm>
#include "helper_cuda.h"
#include "helper_cuda.h"
#include "tensor.hpp"
#include "tensor.hpp"
template
<
unsigned
NDim
>
struct
DeviceTensorDescriptor
struct
DeviceTensorDescriptor
{
{
DeviceTensorDescriptor
()
=
de
lete
;
__host__
__device__
DeviceTensorDescriptor
()
=
de
fault
;
__host__
DeviceTensorDescriptor
(
const
TensorDescriptor
&
host_desc
)
__host__
DeviceTensorDescriptor
(
const
TensorDescriptor
&
host_desc
)
:
mDataType
(
host_desc
.
GetDataType
()),
mDim
(
host_desc
.
GetDimension
())
{
{
std
::
size_t
data_sz
=
host_desc
.
GetDataType
()
==
DataType_t
::
Float
?
4
:
2
;
assert
(
NDim
==
host_desc
.
GetDimension
());
std
::
copy
(
host_desc
.
GetLengths
().
begin
(),
host_desc
.
GetLengths
().
end
(),
mpLengths
);
std
::
copy
(
host_desc
.
GetStrides
().
begin
(),
host_desc
.
GetStrides
().
end
(),
mpStrides
);
}
checkCudaErrors
(
cudaMalloc
(
&
mpLengths
,
data_sz
*
mDim
));
__host__
__device__
unsigned
GetLength
(
unsigned
i
)
const
{
return
mpLengths
[
i
];
}
checkCudaErrors
(
cudaMalloc
(
&
mpStrides
,
data_sz
*
mDim
));
checkCudaErrors
(
cudaMemcpy
(
__host__
__device__
unsigned
long
GetStride
(
unsigned
i
)
const
{
return
mpStrides
[
i
];
}
mpLengths
,
host_desc
.
GetLengths
().
data
(),
data_sz
*
mDim
,
cudaMemcpyHostToDevice
));
checkCudaErrors
(
cudaMemcpy
(
mpStrides
,
host_desc
.
GetStrides
().
data
(),
data_sz
*
mDim
,
cudaMemcpyHostToDevice
));
}
__host__
~
DeviceTensorDescriptor
()
// this is ugly
__host__
__device__
unsigned
long
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
{
#if 0
return
n
*
mpStrides
[
0
]
+
c
*
mpStrides
[
1
]
+
h
*
mpStrides
[
2
]
+
w
*
mpStrides
[
3
];
if(mpLengths != nullptr)
checkCudaErrors(cudaFree(mpLengths));
if(mpStrides != nullptr)
checkCudaErrors(cudaFree(mpStrides));
#endif
}
}
DataType_t
mDataType
;
unsigned
mpLengths
[
NDim
];
unsigned
long
mDim
;
unsigned
long
mpStrides
[
NDim
];
unsigned
long
*
mpLengths
=
nullptr
;
unsigned
long
*
mpStrides
=
nullptr
;
};
};
src/include/direct_convolution.cuh
View file @
dfa02139
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment