Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2f2cf35b
Commit
2f2cf35b
authored
Oct 22, 2018
by
Chao Liu
Browse files
initial cuda build
parent
d51b8158
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
179 additions
and
15 deletions
+179
-15
driver/CMakeLists.txt
driver/CMakeLists.txt
+1
-1
driver/conv.cu
driver/conv.cu
+108
-0
src/include/device_tensor.cuh
src/include/device_tensor.cuh
+39
-0
src/include/direct_convolution.cuh
src/include/direct_convolution.cuh
+12
-0
src/include/tensor.hpp
src/include/tensor.hpp
+17
-14
src/tensor.cpp
src/tensor.cpp
+2
-0
No files found.
driver/CMakeLists.txt
View file @
2f2cf35b
add_executable
(
conv EXCLUDE_FROM_ALL conv.c
pp
)
add_executable
(
conv EXCLUDE_FROM_ALL conv.c
u
)
target_link_libraries
(
conv convolution
)
target_link_libraries
(
conv convolution
)
driver/conv.c
pp
→
driver/conv.c
u
View file @
2f2cf35b
#include <iostream>
#include <iostream>
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "tensor.hpp"
#include "device_tensor.cuh"
#include "direct_convolution.cuh"
template
<
typename
T
>
template
<
typename
T
>
void
direc
t_convolution
(
const
Tensor
<
T
>&
in
,
void
hos
t_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
,
Tensor
<
T
>&
out
,
std
::
size_t
num_thread
)
std
::
size_t
num_thread
)
...
@@ -33,6 +36,35 @@ void direct_convolution(const Tensor<T>& in,
...
@@ -33,6 +36,35 @@ void direct_convolution(const Tensor<T>& in,
f_par
(
num_thread
);
f_par
(
num_thread
);
}
}
// Runs direct_convolution on the device for host tensors in/wei, writing the
// result into host tensor `out`.
//
// Steps: build device-side tensor descriptors, allocate device buffers sized
// by element space, copy inputs host->device, launch the kernel, then copy the
// output back. Launch configuration is currently a single 256-thread block
// (the kernel is a stub in this commit).
//
// Fix over the original: the kernel launch result was never checked, so a bad
// launch configuration or an in-kernel fault would silently produce garbage in
// `out`. We now check cudaGetLastError() after the launch and synchronize
// before reading results back.
template <class T>
void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
{
    // NOTE(review): DeviceTensorDescriptor owns device memory and frees it in
    // its destructor, yet it is passed to the kernel BY VALUE below; any
    // implicit copy/destroy of these objects risks a premature cudaFree —
    // confirm DeviceTensorDescriptor's copy semantics.
    DeviceTensorDescriptor in_desc_device(in.mDesc);
    DeviceTensorDescriptor wei_desc_device(wei.mDesc);
    DeviceTensorDescriptor out_desc_device(out.mDesc);

    std::size_t data_sz = sizeof(T);

    // Buffers sized by element *space* (may exceed element size if strides pad).
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());

    dim3 block_dim(256, 1, 1); // NBlockDim template arg below must match x-dim
    dim3 grid_dim(1, 1, 1);

    direct_convolution<T, 256><<<grid_dim, block_dim>>>(
        in_desc_device,
        static_cast<T*>(in_device_buf.GetDeviceBuffer()),
        wei_desc_device,
        static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
        out_desc_device,
        static_cast<T*>(out_device_buf.GetDeviceBuffer()));

    // Catch launch-configuration errors immediately, then surface any
    // asynchronous execution error before trusting the output buffer.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    out_device_buf.FromDevice(out.mData.data());
}
template
<
class
T
>
template
<
class
T
>
struct
Generator
struct
Generator
{
{
...
@@ -46,9 +78,16 @@ struct Generator
...
@@ -46,9 +78,16 @@ struct Generator
int
main
()
int
main
()
{
{
#if 0
Tensor<float> in({3, 16, 128, 128});
Tensor<float> in({3, 16, 128, 128});
Tensor<float> wei({4, 16, 3, 3});
Tensor<float> wei({4, 16, 3, 3});
Tensor
<
float
>
out
({
3
,
4
,
126
,
126
});
Tensor<float> out_host({3, 4, 126, 126});
#else
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
out_host
({
1
,
1
,
2
,
2
});
#endif
Tensor
<
float
>
out_device
=
out_host
;
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
@@ -57,11 +96,13 @@ int main()
...
@@ -57,11 +96,13 @@ int main()
in
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
in
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
direct_convolution
(
in
,
wei
,
out
,
num_thread
);
host_convolution
(
in
,
wei
,
out_host
,
num_thread
);
device_convolution
(
in
,
wei
,
out_device
);
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
}
}
src/include/device_tensor.cuh
0 → 100644
View file @
2f2cf35b
#pragma once
#include "helper_cuda.h"
#include "tensor.hpp"
// Device-side mirror of a host TensorDescriptor: deep-copies the lengths and
// strides arrays into device global memory so a kernel can index tensors.
// Owns mpLengths/mpStrides and frees them in the destructor.
struct DeviceTensorDescriptor
{
    DeviceTensorDescriptor() = delete;

    // NOTE(review): this type owns device memory but has an implicitly
    // generated copy constructor; the kernel in direct_convolution.cuh takes
    // it BY VALUE, so copies share pointers that the destructor frees —
    // confirm intended ownership before relying on copies.
    __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc)
        : mDataType(host_desc.GetDataType()), mDim(host_desc.GetDimension())
    {
        // BUG FIX: the device arrays hold "unsigned long" entries, so their
        // byte size is sizeof(unsigned long) * mDim. The original sized them
        // by the tensor *element* width (4 bytes for Float, else 2), which
        // under-allocated and under-copied the lengths/strides.
        // (Assumes host std::size_t and unsigned long have the same layout —
        // true on LP64; TODO confirm for other ABIs.)
        const std::size_t array_sz = sizeof(unsigned long) * mDim;

        checkCudaErrors(cudaMalloc(&mpLengths, array_sz));
        checkCudaErrors(cudaMalloc(&mpStrides, array_sz));

        // BUG FIX: cudaMemcpy is (dst, src, count, kind). The original passed
        // the host pointer as the destination while requesting
        // cudaMemcpyHostToDevice, i.e. source and destination were swapped.
        checkCudaErrors(cudaMemcpy(mpLengths,
                                   host_desc.GetLengths().data(),
                                   array_sz,
                                   cudaMemcpyHostToDevice));

        checkCudaErrors(cudaMemcpy(mpStrides,
                                   host_desc.GetStrides().data(),
                                   array_sz,
                                   cudaMemcpyHostToDevice));
    }

    __host__ ~DeviceTensorDescriptor()
    {
        checkCudaErrors(cudaFree(mpLengths));
        checkCudaErrors(cudaFree(mpStrides));
    }

    DataType_t mDataType;      // element type tag copied from the host descriptor
    unsigned long mDim;        // number of dimensions
    unsigned long* mpLengths;  // device array[mDim]: per-dimension lengths
    unsigned long* mpStrides;  // device array[mDim]: per-dimension strides
};
src/include/direct_convolution.cuh
0 → 100644
View file @
2f2cf35b
#pragma once
#include "device_tensor.cuh"
// Device kernel for direct convolution.
//
// Placeholder: the body is intentionally empty in this initial CUDA build
// (see the commit message "initial cuda build"); device_convolution in
// driver/conv.cu already launches it with a <grid=1, block=NBlockDim> config.
//
//   in_desc / wei_desc / out_desc : device tensor descriptors, passed BY VALUE
//   in, wei                       : const pointers to input / weight buffers
//                                   (note: pointer is const, pointee is not —
//                                   presumably `const TFloat*` was intended;
//                                   TODO confirm)
//   out                           : output buffer to be written
//
// NBlockDim is expected to equal blockDim.x chosen by the launcher (256 in
// driver/conv.cu) — assumption from the call site, not enforced here.
template <class TFloat, int NBlockDim>
__global__ void direct_convolution(DeviceTensorDescriptor in_desc,
                                   TFloat* const in,
                                   DeviceTensorDescriptor wei_desc,
                                   TFloat* const wei,
                                   DeviceTensorDescriptor out_desc,
                                   TFloat* out)
{
}
src/include/tensor.hpp
View file @
2f2cf35b
#pragma once
#include <thread>
#include <thread>
#include <vector>
#include <vector>
#include <numeric>
#include <numeric>
...
@@ -89,6 +90,7 @@ struct TensorDescriptor
...
@@ -89,6 +90,7 @@ struct TensorDescriptor
{
{
}
}
DataType_t
GetDataType
()
const
;
std
::
size_t
GetDimension
()
const
;
std
::
size_t
GetDimension
()
const
;
std
::
size_t
GetElementSize
()
const
;
std
::
size_t
GetElementSize
()
const
;
std
::
size_t
GetElementSpace
()
const
;
std
::
size_t
GetElementSpace
()
const
;
...
@@ -105,35 +107,36 @@ struct TensorDescriptor
...
@@ -105,35 +107,36 @@ struct TensorDescriptor
}
}
private:
private:
DataType_t
mDataType
;
std
::
vector
<
std
::
size_t
>
mLens
;
std
::
vector
<
std
::
size_t
>
mLens
;
std
::
vector
<
std
::
size_t
>
mStrides
;
std
::
vector
<
std
::
size_t
>
mStrides
;
DataType_t
mDataType
;
};
};
// Owning RAII wrapper around a raw device allocation of mMemSize bytes
// (renamed from GpuMem in this commit; byte size is now passed directly
// instead of element-count * element-size).
struct DeviceMem
{
    DeviceMem() = delete;

    // Improvement: the destructor frees mpDeviceBuf, so an implicitly
    // generated copy would cause a double cudaFree; forbid copies.
    DeviceMem(const DeviceMem&) = delete;
    DeviceMem& operator=(const DeviceMem&) = delete;

    // Allocates mem_size bytes of device memory.
    // NOTE(review): the cudaMalloc result is not checked here; callers get no
    // signal on allocation failure — consider checkCudaErrors once this
    // header may depend on helper_cuda.h.
    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
        cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
    }

    // Raw device pointer, for passing into kernels.
    void* GetDeviceBuffer() { return mpDeviceBuf; }

    // Copies mMemSize bytes host -> device; returns the cudaError_t as int.
    int ToDevice(const void* p)
    {
        // Improvement: cudaMemcpy's source parameter is already `const void*`,
        // so the original const_cast was unnecessary.
        return static_cast<int>(
            cudaMemcpy(mpDeviceBuf, p, mMemSize, cudaMemcpyHostToDevice));
    }

    // Copies mMemSize bytes device -> host; returns the cudaError_t as int.
    int FromDevice(void* p)
    {
        return static_cast<int>(
            cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
    }

    ~DeviceMem() { cudaFree(mpDeviceBuf); }

    void* mpDeviceBuf;     // owned device allocation
    std::size_t mMemSize;  // size of the allocation in bytes
};
struct
joinable_thread
:
std
::
thread
struct
joinable_thread
:
std
::
thread
...
...
src/tensor.cpp
View file @
2f2cf35b
...
@@ -28,6 +28,8 @@ void TensorDescriptor::CalculateStrides()
...
@@ -28,6 +28,8 @@ void TensorDescriptor::CalculateStrides()
mLens
.
rbegin
(),
mLens
.
rend
()
-
1
,
mStrides
.
rbegin
()
+
1
,
std
::
multiplies
<
std
::
size_t
>
());
mLens
.
rbegin
(),
mLens
.
rend
()
-
1
,
mStrides
.
rbegin
()
+
1
,
std
::
multiplies
<
std
::
size_t
>
());
}
}
// Returns the element-type tag (DataType_t) stored in this descriptor.
DataType_t TensorDescriptor::GetDataType() const { return mDataType; }
// Number of tensor dimensions, i.e. the number of entries in mLens.
std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); }
std
::
size_t
TensorDescriptor
::
GetDimension
()
const
{
return
mLens
.
size
();
}
std
::
size_t
TensorDescriptor
::
GetElementSize
()
const
std
::
size_t
TensorDescriptor
::
GetElementSize
()
const
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment