Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
736a37ba
Commit
736a37ba
authored
Jan 20, 2021
by
Jing Zhang
Browse files
debug
parent
15232a0d
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
44 additions
and
31 deletions
+44
-31
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
...kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+3
-2
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
...lude/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
+14
-1
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_v2.hpp
...sor_operation/threadwise_generic_tensor_slice_copy_v2.hpp
+7
-11
composable_kernel/include/utility/amd_xdlops.hpp
composable_kernel/include/utility/amd_xdlops.hpp
+5
-3
composable_kernel/include/utility/float_type.amd.hpp.in
composable_kernel/include/utility/float_type.amd.hpp.in
+1
-0
driver/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
...tion_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+9
-9
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+5
-5
No files found.
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
View file @
736a37ba
...
@@ -129,6 +129,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops
...
@@ -129,6 +129,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops
return p_c_thread;
return p_c_thread;
}
}
};
};
#endif
template
<
>
template
<
>
struct
WithMNRepeats
<
1
,
1
>
struct
WithMNRepeats
<
1
,
1
>
...
@@ -138,10 +139,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops
...
@@ -138,10 +139,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops
const
FloatB
*
__restrict__
p_b_block
,
const
FloatB
*
__restrict__
p_b_block
,
FloatC
p_c_thread
)
FloatC
p_c_thread
)
{
{
return XdlopsGemm.template Run<M, N, K>(p_a_block, p_b_block, p_c_thread);
p_c_thread
=
XdlopsGemm
.
template
Run
<
M
,
N
,
K
>(
p_a_block
,
p_b_block
,
p_c_thread
);
return
p_c_thread
;
}
}
};
};
#endif
#endif
#endif
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
>
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
>
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
View file @
736a37ba
...
@@ -51,6 +51,8 @@ struct make_block_work_sequence<MBlockWork, NBlockWork, NBlock1MBlock0>
...
@@ -51,6 +51,8 @@ struct make_block_work_sequence<MBlockWork, NBlockWork, NBlock1MBlock0>
__device__
constexpr
auto
get
()
{
return
Sequence
<
NBlockWork
,
MBlockWork
>
{};
}
__device__
constexpr
auto
get
()
{
return
Sequence
<
NBlockWork
,
MBlockWork
>
{};
}
};
};
#define ACCVGPR_ZERO(acc_reg_id) asm volatile("v_accvgpr_write_b32 a[" #acc_reg_id "], 0" : :);
template
<
index_t
GridSize
,
template
<
index_t
GridSize
,
index_t
BlockSize
,
index_t
BlockSize
,
class
ABFloat
,
class
ABFloat
,
...
@@ -212,6 +214,11 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
...
@@ -212,6 +214,11 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
constexpr
index_t
c_thread_size
=
MPerBlock
*
NPerBlock
/
BlockSize
;
constexpr
index_t
c_thread_size
=
MPerBlock
*
NPerBlock
/
BlockSize
;
auto
c_thread_vec
=
GetRegBuffer
<
AccFloat
,
c_thread_size
>
();
auto
c_thread_vec
=
GetRegBuffer
<
AccFloat
,
c_thread_size
>
();
ACCVGPR_ZERO
(
0
)
ACCVGPR_ZERO
(
1
)
ACCVGPR_ZERO
(
2
)
ACCVGPR_ZERO
(
3
)
// preload data into LDS
// preload data into LDS
{
{
a_blockwise_copy
.
Run
(
p_a_global
,
p_a_block
);
a_blockwise_copy
.
Run
(
p_a_global
,
p_a_block
);
...
@@ -496,6 +503,11 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
...
@@ -496,6 +503,11 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
constexpr
index_t
c_thread_size
=
MPerBlock
*
NPerBlock
/
BlockSize
;
constexpr
index_t
c_thread_size
=
MPerBlock
*
NPerBlock
/
BlockSize
;
auto
c_thread_vec
=
GetRegBuffer
<
AccFloat
,
c_thread_size
>
();
auto
c_thread_vec
=
GetRegBuffer
<
AccFloat
,
c_thread_size
>
();
ACCVGPR_ZERO
(
0
)
ACCVGPR_ZERO
(
1
)
ACCVGPR_ZERO
(
2
)
ACCVGPR_ZERO
(
3
)
// preload data into LDS
// preload data into LDS
{
{
a_blockwise_copy
.
Run
(
p_a_global
,
p_a_block
);
a_blockwise_copy
.
Run
(
p_a_global
,
p_a_block
);
...
@@ -615,7 +627,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
...
@@ -615,7 +627,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
m_thread_data_on_global
%
(
M2
*
M1
)
/
M2
,
m_thread_data_on_global
%
(
M2
*
M1
)
/
M2
,
m_thread_data_on_global
%
M2
,
m_thread_data_on_global
%
M2
,
n_thread_data_on_global
))
n_thread_data_on_global
))
.
Store
(
c_thread_vec
.
GetVector
(
Number
<
M0
*
M2
>
{})[
Number
<
blk_id
>
{}],
p_c_global
);
.
Store
(
c_thread_vec
,
p_c_global
);
//.Store(c_thread_vec.GetVector(Number<M0 * M2>{})[Number<blk_id>{}], p_c_global);
});
});
}
}
}
}
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_v2.hpp
View file @
736a37ba
...
@@ -56,15 +56,11 @@ struct ThreadwiseGenericTensorSliceCopy_v5
...
@@ -56,15 +56,11 @@ struct ThreadwiseGenericTensorSliceCopy_v5
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>
{},
"wrong! map is not valid"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>
{},
"wrong! map is not valid"
);
static_assert
(
is_valid_sequence_map
<
DstDimAccessOrder
>
{},
"wrong! map is not valid"
);
static_assert
(
is_valid_sequence_map
<
DstDimAccessOrder
>
{},
"wrong! map is not valid"
);
static_assert
(
static_assert
(
SliceLengths
{}[
SrcVectorReadDim
]
%
SrcDataPerRead
==
0
,
SliceLengths
{}[
SrcVectorReadDim
]
%
math
::
lcm
(
SrcDataPerRead
,
DstDataPerWrite
)
==
0
,
"wrong! cannot evenly divide"
);
"wrong! cannot evenly divide"
);
static_assert
(
static_assert
(
SliceLengths
{}[
DstVectorWriteDim
]
%
DstDataPerWrite
==
0
,
SliceLengths
{}[
DstVectorWriteDim
]
%
math
::
lcm
(
SrcDataPerRead
,
DstDataPerWrite
)
==
0
,
"wrong! cannot evenly divide"
);
"wrong! cannot evenly divide"
);
static_assert
(
ThreadBufferSize
==
8
||
ThreadBufferSize
==
16
,
""
);
}
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v5
()
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v5
()
...
@@ -194,9 +190,9 @@ struct ThreadwiseGenericTensorSliceCopy_v5
...
@@ -194,9 +190,9 @@ struct ThreadwiseGenericTensorSliceCopy_v5
// load data from src to the long-vector buffer
// load data from src to the long-vector buffer
const
auto
src_coord
=
mSrcSliceOrigin
+
to_multi_index
(
long_vector_data_begin_id
);
const
auto
src_coord
=
mSrcSliceOrigin
+
to_multi_index
(
long_vector_data_begin_id
);
auto
src_buff
=
auto
src_buff
=
buffer_vector_load
<
SrcDataPerRead
,
SrcDesc
::
GetElementSpace
()
>
(
vector_data_load
<
SrcData
,
src_data_per_access
>::
run
(
p_src
,
src_coord
);
p_src
,
src_coord
);
//
buffer_
vector_load<SrcData
PerRead, SrcDesc::GetElementSpace()>
(p_src, src_coord);
// vector_
data_
load<SrcData
, src_data_per_access>::run
(p_src, src_coord);
// store data from the long-vector buffer to dst
// store data from the long-vector buffer to dst
constexpr
auto
buff_off
=
constexpr
auto
buff_off
=
...
...
composable_kernel/include/utility/amd_xdlops.hpp
View file @
736a37ba
...
@@ -132,10 +132,12 @@ intrin_mfma_f32_32x32x2f32(const float* reg_a, const float* reg_b, c_vec16_1_t::
...
@@ -132,10 +132,12 @@ intrin_mfma_f32_32x32x2f32(const float* reg_a, const float* reg_b, c_vec16_1_t::
return
reg_c
;
return
reg_c
;
}
}
__device__
c_vec4_1_t
::
VecType
__device__
float_vec4_t
intrin_mfma_f32_16x16x4f32
(
const
float
*
reg_a
,
intrin_mfma_f32_16x16x4f32
(
const
float
*
reg_a
,
const
float
*
reg_b
,
c_vec4_1_t
::
VecType
reg_c
)
const
float
*
reg_b
,
float_vec4_t
reg_c
)
{
{
reg_c
.
s
.
x
=
llvm_intrin_amdgcn_mfma_f32_16x16x4f32
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
.
s
.
x
,
0
,
0
,
0
);
reg_c
.
s4
(
Number
<
0
>
{})
=
llvm_intrin_amdgcn_mfma_f32_16x16x4f32
(
reg_a
[
0
],
reg_b
[
0
],
reg_c
.
s4
[
Number
<
0
>
{}],
0
,
0
,
0
);
return
reg_c
;
return
reg_c
;
}
}
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
View file @
736a37ba
...
@@ -36,6 +36,7 @@ union float_vec4_t
...
@@ -36,6 +36,7 @@ union float_vec4_t
StaticallyIndexedArray<float, 4> s1;
StaticallyIndexedArray<float, 4> s1;
StaticallyIndexedArray<float2_t, 2> s2;
StaticallyIndexedArray<float2_t, 2> s2;
StaticallyIndexedArray<float4_t, 1> s4;
StaticallyIndexedArray<float4_t, 1> s4;
float n[4];
__host__ __device__ constexpr float_vec4_t() {}
__host__ __device__ constexpr float_vec4_t() {}
template <index_t vs>
template <index_t vs>
...
...
driver/include/device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
View file @
736a37ba
...
@@ -64,15 +64,15 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
...
@@ -64,15 +64,15 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
K
,
Ho
,
Wo
>
{});
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
K
,
Ho
,
Wo
>
{});
// read params: tunning parameters
// read params: tunning parameters
constexpr
index_t
GemmMPerBlock
=
1
28
;
constexpr
index_t
GemmMPerBlock
=
1
6
;
constexpr
index_t
GemmNPerBlock
=
25
6
;
constexpr
index_t
GemmNPerBlock
=
1
6
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerWave
=
1
28
;
constexpr
index_t
GemmMPerWave
=
1
6
;
constexpr
index_t
GemmNPerWave
=
6
4
;
constexpr
index_t
GemmNPerWave
=
1
6
;
constexpr
index_t
GemmKPack
=
4
;
constexpr
index_t
GemmKPack
=
4
;
// read params: dependent parameters
// read params: dependent parameters
constexpr
index_t
BlockSize
=
25
6
;
constexpr
index_t
BlockSize
=
6
4
;
constexpr
index_t
GemmM
=
K
;
constexpr
index_t
GemmM
=
K
;
constexpr
index_t
GemmN
=
N
*
Ho
*
Wo
;
constexpr
index_t
GemmN
=
N
*
Ho
*
Wo
;
...
@@ -83,7 +83,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
...
@@ -83,7 +83,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
// A matrix copy
// A matrix copy
constexpr
index_t
GemmABlockCopyClusterLengths_GemmK
=
4
;
constexpr
index_t
GemmABlockCopyClusterLengths_GemmK
=
4
;
constexpr
index_t
GemmABlockCopyClusterLengths_GemmM
=
6
4
;
constexpr
index_t
GemmABlockCopyClusterLengths_GemmM
=
1
6
;
constexpr
index_t
GemmABlockCopyClusterLengths_GemmKPack
=
1
;
constexpr
index_t
GemmABlockCopyClusterLengths_GemmKPack
=
1
;
constexpr
index_t
GemmABlockCopyThreadSliceLengths_GemmK
=
constexpr
index_t
GemmABlockCopyThreadSliceLengths_GemmK
=
...
@@ -114,8 +114,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
...
@@ -114,8 +114,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
// B matrix Copy
// B matrix Copy
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmK
=
4
;
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmK
=
4
;
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmN
=
6
4
;
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmN
=
4
;
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmKPack
=
1
;
constexpr
index_t
GemmBBlockCopyClusterLengths_GemmKPack
=
4
;
constexpr
index_t
GemmBBlockCopyThreadSliceLengths_GemmK
=
constexpr
index_t
GemmBBlockCopyThreadSliceLengths_GemmK
=
GemmKPerBlock
/
GemmBBlockCopyClusterLengths_GemmK
;
GemmKPerBlock
/
GemmBBlockCopyClusterLengths_GemmK
;
...
@@ -140,7 +140,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
...
@@ -140,7 +140,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
using
GemmBBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [GemmG, GemmK, GemmKPack, GemmN]
using
GemmBBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [GemmG, GemmK, GemmKPack, GemmN]
using
GemmBBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [GemmG, GemmK, GemmN, GemmKPack]
using
GemmBBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [GemmG, GemmK, GemmN, GemmKPack]
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
1
;
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
4
;
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmKPack
=
1
;
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmKPack
=
1
;
// gridwise GEMM
// gridwise GEMM
...
...
driver/src/conv_driver.cpp
View file @
736a37ba
...
@@ -24,11 +24,11 @@ int main(int argc, char* argv[])
...
@@ -24,11 +24,11 @@ int main(int argc, char* argv[])
using
namespace
ck
;
using
namespace
ck
;
// 1x1, 56x56
// 1x1, 56x56
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
4
;
constexpr
index_t
C
=
128
;
constexpr
index_t
C
=
32
;
constexpr
index_t
HI
=
56
;
constexpr
index_t
HI
=
2
;
constexpr
index_t
WI
=
56
;
constexpr
index_t
WI
=
2
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
32
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment