Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
43a20997
"docker/vscode:/vscode.git/clone" did not exist on "48d0123f0f4415b1bb78f5a538df8b0b9975c6d4"
Commit
43a20997
authored
Dec 02, 2022
by
aska-0096
Browse files
debugging
parent
0cd587d9
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
13 additions
and
8 deletions
+13
-8
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+2
-5
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
...ude/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+1
-0
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
.../ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+0
-2
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+1
-1
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+1
-0
include/ck/utility/common_header.hpp
include/ck/utility/common_header.hpp
+8
-0
No files found.
example/01_gemm/run_gemm_example.inc
View file @
43a20997
...
@@ -32,10 +32,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -32,10 +32,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5.
f
,
5.
f
}(
a_m_k
.
begin
(),
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
1.
f
,
1.
f
}(
a_m_k
.
begin
(),
a_m_k
.
end
());
a_m_k
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
1.
f
,
1.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
());
break
;
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
.
begin
(),
a_m_k
.
end
());
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
.
begin
(),
a_m_k
.
end
());
...
@@ -102,7 +100,6 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -102,7 +100,6 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return
true
;
return
true
;
}
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
View file @
43a20997
...
@@ -226,6 +226,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3
...
@@ -226,6 +226,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3
constexpr
index_t
c_offset
=
constexpr
index_t
c_offset
=
c_thread_desc_
.
CalculateOffset
(
make_tuple
(
iCut
,
iN
,
0
));
c_thread_desc_
.
CalculateOffset
(
make_tuple
(
iCut
,
iN
,
0
));
// debug_hexprinter(0x3c003c00, a_thread_vec.template AsType<FloatAB>()(Number<0>{}));
wmma_gemm
.
template
Run
(
wmma_gemm
.
template
Run
(
a_thread_vec
.
template
AsType
<
wmma_input_type
>()(
Number
<
0
>{}),
a_thread_vec
.
template
AsType
<
wmma_input_type
>()(
Number
<
0
>{}),
b_thread_vec
.
template
AsType
<
wmma_input_type
>()(
Number
<
0
>
{}),
b_thread_vec
.
template
AsType
<
wmma_input_type
>()(
Number
<
0
>
{}),
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
View file @
43a20997
...
@@ -359,8 +359,6 @@ struct DeviceGemmWmma : public DeviceGemm<ALayout,
...
@@ -359,8 +359,6 @@ struct DeviceGemmWmma : public DeviceGemm<ALayout,
remove_reference_t
<
typename
GridwiseGemm
::
DefaultBlock2CTileMap
>
,
remove_reference_t
<
typename
GridwiseGemm
::
DefaultBlock2CTileMap
>
,
true
>
;
// Last Option is W/O
true
>
;
// Last Option is W/O
std
::
cout
<<
"Host kernel type is "
<<
type_name
<
decltype
(
kernel
)
>
()
<<
std
::
endl
;
printf
(
"---------------------Crush before kernel launch-------------------
\n
"
);
ave_time
=
launch_and_time_kernel
(
stream_config
,
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
kernel
,
dim3
(
grid_size
),
dim3
(
grid_size
),
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
View file @
43a20997
...
@@ -356,6 +356,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
...
@@ -356,6 +356,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
/*******************************************************************************/
/*******************************************************************************/
// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy
// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy
const
auto
K0
=
a_grid_desc_k0_m_k1
.
GetLength
(
I0
);
const
auto
K0
=
a_grid_desc_k0_m_k1
.
GetLength
(
I0
);
printf
(
"A_GRID_DESC: %s
\n
"
,
std
::
string
(
type_name
<
decltype
(
a_grid_desc_k0_m_k1
)
>
()).
c_str
());
constexpr
auto
max_lds_align
=
K1
;
constexpr
auto
max_lds_align
=
K1
;
constexpr
auto
a_block_desc_k0perblock_mperblock_k1
=
GetABlockDescriptor_K0PerBlock_MPerBlock_K1
();
constexpr
auto
a_block_desc_k0perblock_mperblock_k1
=
GetABlockDescriptor_K0PerBlock_MPerBlock_K1
();
constexpr
auto
b_block_desc_k0perblock_nperblock_k1
=
GetBBlockDescriptor_K0PerBlock_NPerBlock_K1
();
constexpr
auto
b_block_desc_k0perblock_nperblock_k1
=
GetBBlockDescriptor_K0PerBlock_NPerBlock_K1
();
...
@@ -457,7 +458,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
...
@@ -457,7 +458,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
// gridwise GEMM pipeline
// gridwise GEMM pipeline
const
index_t
K0BlockMainLoop
=
__builtin_amdgcn_readfirstlane
(
K0
/
K0PerBlock
);
const
index_t
K0BlockMainLoop
=
__builtin_amdgcn_readfirstlane
(
K0
/
K0PerBlock
);
GridwiseGemmPipe
::
template
Run
<
HasMainKBlockLoop
>(
a_grid_desc_k0_m_k1
,
GridwiseGemmPipe
::
template
Run
<
HasMainKBlockLoop
>(
a_grid_desc_k0_m_k1
,
a_block_desc_k0perblock_mperblock_k1
,
a_block_desc_k0perblock_mperblock_k1
,
a_blockwise_copy
,
a_blockwise_copy
,
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
43a20997
...
@@ -208,6 +208,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -208,6 +208,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
src_buf
.
template
Get
<
src_vector_t
>(
src_coord_
.
GetOffset
(),
is_src_valid
)};
src_buf
.
template
Get
<
src_vector_t
>(
src_coord_
.
GetOffset
(),
is_src_valid
)};
// apply SrcElementwiseOperation on src_vector_container
// apply SrcElementwiseOperation on src_vector_container
debug_hexprinter
(
0xffffffff
,
src_coord_
.
GetOffset
());
static_for
<
0
,
SrcScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
SrcScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
SrcData
src_v
;
SrcData
src_v
;
...
...
include/ck/utility/common_header.hpp
View file @
43a20997
...
@@ -72,3 +72,11 @@ constexpr auto type_name() {
...
@@ -72,3 +72,11 @@ constexpr auto type_name() {
name
.
remove_suffix
(
suffix
.
size
());
name
.
remove_suffix
(
suffix
.
size
());
return
name
;
return
name
;
}
}
template
<
typename
T
>
__device__
void
debug_hexprinter
(
const
uint32_t
v_target
,
T
v_val
){
const
uint32_t
v_dbg
=
*
(
reinterpret_cast
<
uint32_t
*>
(
&
v_val
));
if
(
v_dbg
!=
v_target
)
printf
(
"@Thread: %d, Val: %08x != Target: %08x
\n
"
,
ck
::
get_thread_local_1d_id
(),
v_dbg
,
v_target
);
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment