gaoqiong / composable_kernel · Commit 6e59255a

Authored Mar 13, 2021 by root
Parent: 7d0a5412

    debug

Showing 4 changed files with 18 additions and 64 deletions (+18 −64)
Changed files:
    composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp                         +3  -37
    composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp                  +11 -23
    driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp  +3  -3
    driver/src/conv_driver.cpp                                                               +1  -1

composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -33,7 +33,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
     };

     index_t mMyThreadOffsetA;
-    index_t mMyThreadOffsetB;

     __device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
     {
@@ -68,8 +67,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

         mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.k));
-        mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(
-            make_tuple(0, 0, c_thread_mtx_index.h, c_thread_mtx_index.w));
     }

     __device__ static constexpr auto GetThreadMatrixCLengths()
@@ -109,38 +106,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         }
     };

-    template <typename SrcDesc,
-              typename DstDesc,
-              index_t NSliceCYX,
-              index_t NSliceH,
-              index_t NSliceW,
-              index_t DataPerAccess>
-    struct ThreadwiseSliceCopy_b
-    {
-        template <typename Data>
-        __device__ static void Run(const Data* p_src, Data* p_dst)
-        {
-            static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                          "wrong! Desc should be known at compile-time");
-
-            using vector_t = typename vector_type<Data, DataPerAccess>::type;
-
-            static_for<0, NSliceCYX, 1>{}([&](auto i) {
-                static_for<0, NSliceH, 1>{}([&](auto j) {
-                    static_for<0, NSliceW, 1>{}([&](auto k) {
-                        constexpr auto src_offset = SrcDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
-                        constexpr auto dst_offset = DstDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
-
-                        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-                    });
-                });
-            });
-        }
-    };
-
     template <typename FloatA, typename FloatB, typename FloatC>
     __device__ void
     Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
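
Annotation: the deleted ThreadwiseSliceCopy_b copied a small compile-time slice by reinterpreting DataPerAccess contiguous elements as a single vector type (vector_type<Data, DataPerAccess>::type in the removed code). The host-side sketch below illustrates that access pattern only; copy_by_vectors and its parameters are hypothetical names, not part of composable_kernel.

    // Minimal sketch of a vector-width copy, analogous to the removed
    // ThreadwiseSliceCopy_b. Names and layout are illustrative only.
    #include <array>
    #include <cstddef>
    #include <iostream>

    template <std::size_t DataPerAccess, typename Data, std::size_t N>
    void copy_by_vectors(const std::array<Data, N>& src, std::array<Data, N>& dst)
    {
        static_assert(N % DataPerAccess == 0, "slice must be a multiple of the access width");

        // Each iteration moves DataPerAccess contiguous elements at once,
        // mirroring the *reinterpret_cast<vector_t*> pattern in the removed code.
        struct vector_t { Data v[DataPerAccess]; };

        for(std::size_t i = 0; i < N; i += DataPerAccess)
        {
            *reinterpret_cast<vector_t*>(&dst[i]) = *reinterpret_cast<const vector_t*>(&src[i]);
        }
    }

    int main()
    {
        std::array<float, 8> src{0, 1, 2, 3, 4, 5, 6, 7}, dst{};
        copy_by_vectors<4>(src, dst);
        std::cout << dst[5] << "\n"; // prints 5
    }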
@@ -160,7 +125,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
             make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));

         constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));

         constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
             make_tuple(Number<KPerThread>{}, Number<1>{}));

@@ -183,7 +149,7 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
                 mMyThreadOffsetA,
                 p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));

-            // threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
+            threadwise_gemm.Run(p_a_thread, p_b_thread + CYXPerThreadLoop * cyx_begin, p_c_thread);
        }
    }
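
Annotation: Run_naive now reads the B operand from the thread-private buffer p_b_thread, advanced by CYXPerThreadLoop * cyx_begin each pass. The sketch below only walks through that offset arithmetic, assuming cyx_begin counts CYXPerThreadLoop-sized chunks; NumCYXLoops and the buffer contents are made up for illustration.

    // Sketch of the register-B indexing implied by
    // threadwise_gemm.Run(p_a_thread, p_b_thread + CYXPerThreadLoop * cyx_begin, p_c_thread).
    #include <cstdio>

    int main()
    {
        constexpr int CYXPerThreadLoop = 4;
        constexpr int NumCYXLoops      = 3;
        float p_b_thread[CYXPerThreadLoop * NumCYXLoops] = {}; // filled by the B copy in the kernel

        for(int cyx_begin = 0; cyx_begin < NumCYXLoops; ++cyx_begin)
        {
            // Each iteration consumes the next CYXPerThreadLoop elements of the
            // thread-private B buffer.
            const float* b_chunk = p_b_thread + CYXPerThreadLoop * cyx_begin;
            std::printf("iteration %d reads b[%d..%d]\n",
                        cyx_begin,
                        CYXPerThreadLoop * cyx_begin,
                        CYXPerThreadLoop * cyx_begin + CYXPerThreadLoop - 1);
            (void)b_chunk;
        }
    }
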
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -56,32 +56,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 {
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
-        constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M>{},
-                                                 Number<BBlockTransferDstScalarPerVector_N>{},
-                                                 Number<KPerThread>{});
+        constexpr auto max_lds_align =
+            math::lcm(Number<ABlockTransferDstScalarPerVector_M>{},
+                      Number<KPerThread>{},
+                      Number<HWPerThread>{});
+
+        static_assert(CYXPerBlock == 4 && HWPerBlock == 64 && KPerBlock == 16, "");

         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto a_cyx_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<CYXPerBlock>{}, Number<KPerBlock>{}), max_lds_align);

-        // B matrix in LDS memory, dst of blockwise copy
-        // be careful of LDS alignment
-        constexpr auto b_cyx_n_h_w_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
-            make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<8>{}, Number<8>{}), max_lds_align);
-
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size =
             math::integer_least_multiple(a_cyx_k_block_desc.GetElementSpaceSize(), max_lds_align);

-        constexpr auto b_block_space_size = math::integer_least_multiple(
-            b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);
-
-        return 2 * (a_block_space_size + b_block_space_size) * sizeof(Float);
+        return 2 * (a_block_space_size) * sizeof(Float);
     }

     template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
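
Annotation: with the B tile no longer staged in LDS, GetSharedMemoryNumberOfByte() counts only the double-buffered A tile. A back-of-the-envelope check under the values pinned by the new static_assert (CYXPerBlock = 4, KPerBlock = 16), assuming Float is a 32-bit float and max_lds_align introduces no padding:

    // Hypothetical size check of the new shared-memory computation.
    #include <cstdio>

    int main()
    {
        constexpr int CYXPerBlock = 4;
        constexpr int KPerBlock   = 16;

        // a_cyx_k_block_desc covers CYXPerBlock x KPerBlock elements.
        constexpr int a_block_space_size = CYXPerBlock * KPerBlock;           // 64 elements
        constexpr int lds_bytes = 2 * a_block_space_size * sizeof(float);     // double buffer

        std::printf("%d bytes of LDS\n", lds_bytes); // 512
    }
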
@@ -180,17 +167,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 #if 1
         constexpr auto b_cyx_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<CYXPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<1>{}, Number<1>{}));

         const index_t h_thread_id = get_thread_local_1d_id() / 8;
         const index_t w_thread_id = get_thread_local_1d_id() % 8;

-        auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
+        using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
             Float,
             Float,
             decltype(b_cyx_n_h_w_global_desc),
             decltype(b_cyx_n_h_w_thread_desc),
-            Sequence<CYXPerThread, 1, 1, 1>,
+            Sequence<CYXPerBlock, 1, 1, 1>,
             Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
             3,                    // BBlockTransferSrcVectorDim,
             1,                    // BBlockTransferSrcScalarPerVector,
@@ -198,7 +185,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             AddressSpace::Vgpr,
             InMemoryDataOperation::Set,
             1,
-            true>(
+            true>;
+
+        ThreadwiseTensorSliceTransferB b_threadwise_transfer(
             b_cyx_n_h_w_global_desc,
             make_multi_index(
                 0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
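
Annotation: the transfer object is now declared through a named type alias and constructed separately instead of a single auto initializer. A generic sketch of that split follows; Transfer, MyDesc, and TransferB are stand-in names, not composable_kernel types.

    // Splitting a long templated initializer into "using alias" + named construction,
    // as done for ThreadwiseTensorSliceTransferB / b_threadwise_transfer.
    #include <cstdio>

    template <typename Desc, int VectorDim, int ScalarPerVector>
    struct Transfer
    {
        Transfer(Desc desc, int origin) : desc_(desc), origin_(origin) {}
        Desc desc_;
        int  origin_;
    };

    int main()
    {
        struct MyDesc { int size; };

        // Before: auto t = Transfer<MyDesc, 3, 1>(MyDesc{64}, 0);
        // After: name the instantiation once, then construct it.
        using TransferB = Transfer<MyDesc, 3, 1>;
        TransferB t(MyDesc{64}, 0);

        std::printf("origin = %d\n", t.origin_);
    }
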
@@ -234,7 +223,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);

         Float* p_a_block_double = p_shared_block;
-        Float* p_b_block_double = p_shared_block + 2 * a_block_space_size;

         // register allocation for output
         AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];
@@ -279,16 +267,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);

             __syncthreads();
+
+            blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread);
         }

 #if 0
         if constexpr(HasMainKBlockLoop)
         {
             Float* p_a_block_even = p_a_block_double;
-            Float* p_b_block_even = p_b_block_double;
             Float* p_a_block_odd  = p_a_block_double + a_block_space_size;
-            Float* p_b_block_odd  = p_b_block_double + b_block_space_size;

             index_t b_block_data_begin = 0;
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -72,7 +72,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmMPerBlock = 16;
     constexpr index_t GemmNPerBlock = 64;
-    constexpr index_t GemmKPerBlock = 4;
+    constexpr index_t GemmKPerBlock = 4 * 3 * 3;

     constexpr index_t GemmMPerThread = 16;
     constexpr index_t GemmNPerThread = 1;
@@ -83,14 +83,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmMLevel1Cluster = 1;
     constexpr index_t GemmNLevel1Cluster = 64;

-    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

     using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<4, 1>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<9, 64>;

     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
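
Annotation: the new driver constants read as folding a 3x3 filter into the GEMM K dimension: GemmKPerBlock = 4 * 3 * 3 = 36, and each block transfer's per-thread slice length times its thread-cluster length along GemmK covers exactly that 36 (A: 9 * 4, B: 4 * 9). The check below only multiplies the numbers from this diff; the slice-times-cluster relationship is the usual composable_kernel tiling convention, stated here as an assumption rather than a library guarantee.

    // Consistency check of the block-transfer tiling after this commit.
    #include <cstdio>

    int main()
    {
        constexpr int GemmKPerBlock = 4 * 3 * 3; // 36
        constexpr int GemmMPerBlock = 16;
        constexpr int GemmNPerBlock = 64;

        // A-side: Sequence<9, 1> slice per thread, Sequence<4, 16> thread cluster.
        static_assert(9 * 4 == GemmKPerBlock && 1 * 16 == GemmMPerBlock, "A tile mismatch");

        // B-side: Sequence<4, 1> slice per thread, Sequence<9, 64> thread cluster.
        static_assert(4 * 9 == GemmKPerBlock && 1 * 64 == GemmNPerBlock, "B tile mismatch");

        std::printf("A tile %dx%d, B tile %dx%d covered per block\n",
                    GemmKPerBlock, GemmMPerBlock, GemmKPerBlock, GemmNPerBlock);
    }
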
driver/src/conv_driver.cpp
@@ -779,7 +779,7 @@ int main(int argc, char* argv[])
 #if 1
     // LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
     // LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
-    // LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
+    LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
     LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
 #endif
 }
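
Annotation: LogRange is the driver's range-printing helper. A minimal stand-in with the same call shape (an illustrative sketch, not the repository's actual definition) could look like this:

    // Stand-in for a LogRange-style helper: prints every element of a range to
    // the given stream, separated by a delimiter, and returns the stream so
    // that "<< std::endl" can be chained as in conv_driver.cpp.
    #include <iostream>
    #include <string>
    #include <vector>

    template <typename Range>
    std::ostream& LogRange(std::ostream& os, const Range& range, const std::string& delim)
    {
        bool first = true;
        for(const auto& v : range)
        {
            if(!first)
                os << delim;
            os << v;
            first = false;
        }
        return os;
    }

    int main()
    {
        std::vector<float> out_nkhw_host{1.0f, 2.0f, 3.0f};
        LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host, ",") << std::endl;
    }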