Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
df228b3c
"...composable_kernel-1.git" did not exist on "15c89e81f0587e8b46caa6062040c469a97ebc09"
Commit
df228b3c
authored
Jan 08, 2019
by
Chao Liu
Browse files
refactor
parent
0b8e67ef
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
57 additions
and
62 deletions
+57
-62
driver/conv.cu
driver/conv.cu
+2
-1
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+1
-1
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+18
-58
src/include/conv_common.cuh
src/include/conv_common.cuh
+34
-0
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+2
-2
No files found.
driver/conv.cu
View file @
df228b3c
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "tensor.hpp"
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#include "conv_common.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution.cuh"
//#include "device_implicit_gemm_convolution.cuh"
...
@@ -367,7 +368,7 @@ int main()
...
@@ -367,7 +368,7 @@ int main()
auto
in_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
S
,
R
>
{});
auto
wei_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
S
,
R
>
{});
auto
out_desc
=
get_convolution_output_4d_tensor_descriptor
(
in_desc
,
wei_desc
);
auto
out_desc
=
get_convolution_output_
default_
4d_tensor_descriptor
(
in_desc
,
wei_desc
);
ostream_ConstantTensorDescriptor
(
in_desc
,
std
::
cout
<<
"in_desc: "
);
ostream_ConstantTensorDescriptor
(
in_desc
,
std
::
cout
<<
"in_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_desc
,
std
::
cout
<<
"wei_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_desc
,
std
::
cout
<<
"wei_desc: "
);
...
...
src/include/blockwise_direct_convolution.cuh
View file @
df228b3c
...
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
...
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{});
constexpr
auto
out_thread_desc
=
constexpr
auto
out_thread_desc
=
get_convolution_output_4d_tensor_descriptor
(
in_thread_desc
,
wei_thread_desc
);
get_convolution_output_
default_
4d_tensor_descriptor
(
in_thread_desc
,
wei_thread_desc
);
constexpr
auto
in_thread_block_desc
=
constexpr
auto
in_thread_block_desc
=
make_ConstantTensorDescriptor
(
in_thread_desc
.
GetLengths
(),
in_block_desc
.
GetStrides
());
make_ConstantTensorDescriptor
(
in_thread_desc
.
GetLengths
(),
in_block_desc
.
GetStrides
());
...
...
src/include/constant_tensor_descriptor.cuh
View file @
df228b3c
...
@@ -23,14 +23,6 @@ struct Sequence
...
@@ -23,14 +23,6 @@ struct Sequence
return
mData
[
I
];
return
mData
[
I
];
}
}
template
<
unsigned
I
>
__host__
__device__
constexpr
auto
GetConstant
(
Number
<
I
>
)
const
{
constexpr
unsigned
N
=
Get
(
I
);
return
Number
<
N
>
{};
}
template
<
unsigned
I0
,
unsigned
I1
>
template
<
unsigned
I0
,
unsigned
I1
>
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
)
const
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
)
const
{
{
...
@@ -61,17 +53,15 @@ struct Sequence
...
@@ -61,17 +53,15 @@ struct Sequence
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
>
{};
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
>
{};
}
}
template
<
unsigned
I0
,
unsigned
I1
,
unsigned
I2
,
unsigned
I3
,
unsigned
I4
>
template
<
unsigned
I0
,
unsigned
I1
,
unsigned
I2
,
unsigned
I3
>
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
Reorder
(
Sequence
<
I0
,
I1
,
I2
,
I3
>
)
const
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
,
Number
<
I2
>
,
Number
<
I3
>
,
Number
<
I4
>
)
const
{
{
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
constexpr
unsigned
IR2
=
Get
(
Number
<
I2
>
{});
constexpr
unsigned
IR2
=
Get
(
Number
<
I2
>
{});
constexpr
unsigned
IR3
=
Get
(
Number
<
I3
>
{});
constexpr
unsigned
IR3
=
Get
(
Number
<
I3
>
{});
constexpr
unsigned
IR4
=
Get
(
Number
<
I4
>
{});
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
,
IR4
>
{};
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
>
{};
}
}
};
};
...
@@ -132,7 +122,8 @@ struct ConstantTensorDescriptor
...
@@ -132,7 +122,8 @@ struct ConstantTensorDescriptor
}
}
// this is ugly, only for 4d
// this is ugly, only for 4d
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
__host__
__device__
unsigned
Get1dIndex
(
unsigned
i0
,
unsigned
i1
,
unsigned
i2
,
unsigned
i3
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -140,24 +131,24 @@ struct ConstantTensorDescriptor
...
@@ -140,24 +131,24 @@ struct ConstantTensorDescriptor
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
return
i0
*
GetStride
(
I0
)
+
i1
*
GetStride
(
I1
)
+
i2
*
GetStride
(
I2
)
+
i3
*
GetStride
(
I3
);
}
template
<
class
...
Is
>
__host__
__device__
constexpr
auto
Reorder
(
Is
...
is
)
const
{
constexpr
auto
lengths
=
Lengths
{}.
Reorder
(
is
...);
constexpr
auto
strides
=
Strides
{}.
Reorder
(
is
...);
return
ConstantTensorDescriptor
<
decltype
(
lengths
),
decltype
(
strides
)
>
{};
}
}
};
};
// this is ugly, only for 4d
// this is ugly, only for 4d
template
<
unsigned
N
,
unsigned
C
,
unsigned
H
,
unsigned
W
>
template
<
unsigned
L0
,
unsigned
L1
,
unsigned
L2
,
unsigned
L3
>
__host__
__device__
constexpr
auto
calculate_default_strides
(
Sequence
<
N
,
C
,
H
,
W
>
)
__host__
__device__
constexpr
auto
calculate_default_strides
(
Sequence
<
L0
,
L1
,
L2
,
L3
>
)
{
return
Sequence
<
L1
*
L2
*
L3
,
L2
*
L3
,
L3
,
1
>
{};
}
// this is ugly, only for 4d
template
<
unsigned
S0
,
unsigned
S1
,
unsigned
S2
,
unsigned
S3
>
__host__
__device__
constexpr
auto
calculate_full_lengths
(
Sequence
<
S0
,
S1
,
S2
,
S3
>
)
{
{
return
Sequence
<
C
*
H
*
W
,
H
*
W
,
W
,
1
>
{};
static_assert
((
S0
%
S1
==
0
)
&&
(
S1
%
S2
==
0
)
&&
(
S2
%
S3
==
0
),
"cannot be evenly divided!"
);
return
Sequence
<
1
,
S0
/
S1
,
S1
/
S2
,
S2
/
S3
>
{};
}
}
template
<
class
Lengths
>
template
<
class
Lengths
>
...
@@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
...
@@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
}
}
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
__host__
__device__
constexpr
auto
get_convolution_output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
{
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
static_assert
(
in_desc
.
GetLength
(
I1
)
==
wei_desc
.
GetLength
(
I1
),
"input & weight dimension not consistent"
);
constexpr
auto
N
=
in_desc
.
GetLength
(
I0
);
constexpr
auto
HI
=
in_desc
.
GetLength
(
I2
);
constexpr
auto
WI
=
in_desc
.
GetLength
(
I3
);
constexpr
auto
K
=
wei_desc
.
GetLength
(
I0
);
constexpr
auto
S
=
wei_desc
.
GetLength
(
I2
);
constexpr
auto
R
=
wei_desc
.
GetLength
(
I3
);
constexpr
auto
HO
=
HI
-
S
+
1
;
constexpr
auto
WO
=
WI
-
R
+
1
;
return
make_ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
// this is ugly, only for 4d
// this is ugly, only for 4d
template
<
class
TDesc
>
template
<
class
TDesc
>
__host__
__device__
void
print_ConstantTensorDescriptor
(
TDesc
,
const
char
*
s
)
__host__
__device__
void
print_ConstantTensorDescriptor
(
TDesc
,
const
char
*
s
)
...
...
src/include/conv_common.cuh
0 → 100644
View file @
df228b3c
#pragma once
#include "constant_tensor_descriptor.cuh"
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
__host__
__device__
constexpr
auto
get_convolution_output_default_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
{
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
static_assert
(
in_desc
.
GetLength
(
I1
)
==
wei_desc
.
GetLength
(
I1
),
"input & weight dimension not consistent"
);
constexpr
auto
N
=
in_desc
.
GetLength
(
I0
);
constexpr
auto
HI
=
in_desc
.
GetLength
(
I2
);
constexpr
auto
WI
=
in_desc
.
GetLength
(
I3
);
constexpr
auto
K
=
wei_desc
.
GetLength
(
I0
);
constexpr
auto
S
=
wei_desc
.
GetLength
(
I2
);
constexpr
auto
R
=
wei_desc
.
GetLength
(
I3
);
constexpr
auto
HO
=
HI
-
S
+
1
;
constexpr
auto
WO
=
WI
-
R
+
1
;
return
make_ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
src/include/gridwise_direct_convolution_2.cuh
View file @
df228b3c
...
@@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
constexpr
auto
wei_thread_block_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
wei_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{},
wei_block_desc
.
GetStrides
());
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{},
wei_block_desc
.
GetStrides
());
constexpr
auto
out_thread_desc
=
constexpr
auto
out_thread_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
get_convolution_output_4d_tensor_descriptor
(
in_thread_block_desc
,
wei_thread_block_desc
);
in_thread_block_desc
,
wei_thread_block_desc
);
// register
// register
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment