Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
58d75b7a
Unverified
Commit
58d75b7a
authored
Dec 17, 2024
by
M.Emin Ozturk
Committed by
GitHub
Dec 17, 2024
Browse files
Merge branch 'develop' into gemm_bf16_sk_muozturk
parents
7ed95722
627a27bd
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3604 additions
and
1962 deletions
+3604
-1962
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
...nsor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+2
-1
include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
...e/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+29
-16
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
...operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+1
-0
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
...device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+48
-138
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
...vice_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+131
-163
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
...ration/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+8
-8
include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
...operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
+749
-526
include/ck/utility/math_v2.hpp
include/ck/utility/math_v2.hpp
+1
-1
include/ck_tile/README.md
include/ck_tile/README.md
+3
-0
include/ck_tile/core.hpp
include/ck_tile/core.hpp
+1
-0
include/ck_tile/ops/flatmm.hpp
include/ck_tile/ops/flatmm.hpp
+1
-0
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
.../flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
+510
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
.../block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
+794
-589
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
...ck/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
+769
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
...tmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
+504
-503
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+8
-7
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
...sed_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+27
-2
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
...e/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
+3
-1
include/ck_tile/ops/gemm.hpp
include/ck_tile/ops/gemm.hpp
+1
-1
include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
...ude/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
+14
-6
No files found.
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
View file @
58d75b7a
...
@@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator
...
@@ -89,7 +89,8 @@ struct DeviceBatchedGemmV2MultiD : public BaseOperator
index_t
BatchStrideE
,
index_t
BatchStrideE
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
=
0
;
CDEElementwiseOperation
cde_element_op
,
index_t
KBatch
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
View file @
58d75b7a
...
@@ -41,12 +41,15 @@ __global__ void
...
@@ -41,12 +41,15 @@ __global__ void
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
const
index_t
g_idx
=
blockIdx
.
z
%
karg
.
Batch
;
const
index_t
g_idx
=
blockIdx
.
z
%
karg
.
Batch
;
const
index_t
k_idx
=
blockIdx
.
z
/
karg
.
Batch
;
const
auto
a_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
);
const
auto
a_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
);
const
auto
b_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
);
const
auto
b_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
);
const
auto
ds_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
auto
ds_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
auto
c_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
);
const
auto
c_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
);
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
,
k_idx
);
// populate pointer, desc for Ds
// populate pointer, desc for Ds
static_for
<
0
,
GridwiseGemm
::
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
GridwiseGemm
::
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
// D pointer
// D pointer
...
@@ -54,8 +57,8 @@ __global__ void
...
@@ -54,8 +57,8 @@ __global__ void
});
});
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
karg
.
p_a_grid
+
a_batch_offset
,
karg
.
p_a_grid
+
a_batch_offset
+
splitk_batch_offset
.
a_k_split_offset
,
karg
.
p_b_grid
+
b_batch_offset
,
karg
.
p_b_grid
+
b_batch_offset
+
splitk_batch_offset
.
b_k_split_offset
,
karg
.
p_ds_grid
,
karg
.
p_ds_grid
,
karg
.
p_c_grid
+
c_batch_offset
,
karg
.
p_c_grid
+
c_batch_offset
,
p_shared
,
p_shared
,
...
@@ -87,12 +90,15 @@ __global__ void
...
@@ -87,12 +90,15 @@ __global__ void
__shared__
char
p_shared_1
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared_1
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
const
index_t
g_idx
=
blockIdx
.
z
%
karg
.
Batch
;
const
index_t
g_idx
=
blockIdx
.
z
%
karg
.
Batch
;
const
index_t
k_idx
=
blockIdx
.
z
/
karg
.
Batch
;
const
auto
a_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
);
const
auto
a_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
);
const
auto
b_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
);
const
auto
b_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
);
const
auto
ds_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
auto
ds_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
auto
c_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
);
const
auto
c_batch_offset
=
karg
.
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
);
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
,
k_idx
);
// populate pointer, desc for Ds
// populate pointer, desc for Ds
static_for
<
0
,
GridwiseGemm
::
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
GridwiseGemm
::
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
// D pointer
// D pointer
...
@@ -100,8 +106,8 @@ __global__ void
...
@@ -100,8 +106,8 @@ __global__ void
});
});
GridwiseGemm
::
template
Run_2Lds
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
GridwiseGemm
::
template
Run_2Lds
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
karg
.
p_a_grid
+
a_batch_offset
,
karg
.
p_a_grid
+
a_batch_offset
+
splitk_batch_offset
.
a_k_split_offset
,
karg
.
p_b_grid
+
b_batch_offset
,
karg
.
p_b_grid
+
b_batch_offset
+
splitk_batch_offset
.
b_k_split_offset
,
karg
.
p_ds_grid
,
karg
.
p_ds_grid
,
karg
.
p_c_grid
+
c_batch_offset
,
karg
.
p_c_grid
+
c_batch_offset
,
p_shared_0
,
p_shared_0
,
...
@@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -303,7 +309,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t
Batch_
,
index_t
Batch_
,
AElementwiseOperation
a_element_op_
,
AElementwiseOperation
a_element_op_
,
BElementwiseOperation
b_element_op_
,
BElementwiseOperation
b_element_op_
,
CElementwiseOperation
c_element_op_
)
CElementwiseOperation
c_element_op_
,
index_t
KBatch_
)
:
GridwiseGemm
::
Argument
{
p_a_grid_
,
:
GridwiseGemm
::
Argument
{
p_a_grid_
,
p_b_grid_
,
p_b_grid_
,
p_ds_grid_
,
p_ds_grid_
,
...
@@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -315,7 +322,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
StrideB_
,
StrideB_
,
StrideDs_
,
StrideDs_
,
StrideE_
,
StrideE_
,
1
,
KBatch_
,
a_element_op_
,
a_element_op_
,
b_element_op_
,
b_element_op_
,
c_element_op_
},
c_element_op_
},
...
@@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -336,13 +343,14 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
arg
.
Print
();
arg
.
Print
();
}
}
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
)
||
arg
.
KBatch
>
1
)
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
))
{
{
throw
std
::
runtime_error
(
"wrong! GridwiseGemm has invalid setting"
);
throw
std
::
runtime_error
(
"wrong! GridwiseGemm has invalid setting"
);
}
}
index_t
gdx
,
gdy
,
gdz
;
index_t
gdx
,
gdy
,
gdz
;
std
::
tie
(
gdx
,
gdy
,
gdz
)
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
M
,
arg
.
N
,
arg
.
Batch
);
std
::
tie
(
gdx
,
gdy
,
gdz
)
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
M
,
arg
.
N
,
arg
.
Batch
*
arg
.
KBatch
);
float
ave_time
=
0
;
float
ave_time
=
0
;
...
@@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -387,10 +395,11 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
rotating_mem
.
Next
();
rotating_mem
.
Next
();
// clear c mem
// clear c mem
if
(
arg_
.
KBatch
>
1
)
if
(
arg_
.
KBatch
>
1
)
hipGetErrorString
(
hipMemsetAsync
(
arg_
.
p_c_grid
,
hipGetErrorString
(
0
,
hipMemsetAsync
(
arg_
.
p_c_grid
,
arg_
.
M
*
arg_
.
N
*
sizeof
(
CDataType
),
0
,
stream_config
.
stream_id_
));
arg
.
Batch
*
arg_
.
M
*
arg_
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
};
};
ave_time
=
ck
::
utility
::
launch_and_time_kernel_with_preprocess
<
false
>
(
ave_time
=
ck
::
utility
::
launch_and_time_kernel_with_preprocess
<
false
>
(
...
@@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -889,7 +898,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t
BatchStrideE
,
index_t
BatchStrideE
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
CElementwiseOperation
c_element_op
,
index_t
KBatch
=
1
)
{
{
return
Argument
{
static_cast
<
const
ADataType
*>
(
p_a
),
return
Argument
{
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
const
BDataType
*>
(
p_b
),
...
@@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -909,7 +919,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
Batch
,
Batch
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
c_element_op
};
c_element_op
,
KBatch
};
}
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
...
@@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -934,7 +945,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
index_t
BatchStrideE
,
index_t
BatchStrideE
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
override
CElementwiseOperation
c_element_op
,
index_t
KBatch
=
1
)
override
{
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
const
BDataType
*>
(
p_b
),
...
@@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
...
@@ -954,7 +966,8 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
Batch
,
Batch
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
c_element_op
);
c_element_op
,
KBatch
);
}
}
// polymorphic
// polymorphic
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
View file @
58d75b7a
...
@@ -729,6 +729,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
...
@@ -729,6 +729,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
return
str
.
str
();
return
str
.
str
();
}
}
REGISTER_EXTRA_PRINTING_METHODS
};
};
}
// namespace device
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
View file @
58d75b7a
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
...
@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
index_t
KPerBlock
=
K0PerBlock
*
K1
;
static
constexpr
index_t
KPerBlock
=
K0PerBlock
*
K1
;
static
constexpr
auto
transform_conv_to_gemm
=
using
ConvToGemmBwdDataTransform
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
ConvBackwardDataSpecialization
,
K1
,
K1
,
K1
,
K1
,
MPerBlock
,
MPerBlock
,
NPerBlock
,
NPerBlock
,
KPerBlock
,
KPerBlock
,
true
/* DoPadGemmM */
,
true
/* DoPadGemmM */
,
true
/* DoPadGemmN */
,
true
/* DoPadGemmN */
>
{};
ALayout
,
BLayout
,
static
auto
GetDummyABDsEGridDescriptor
()
ELayout
>
;
{
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_lengths
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_strides
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
>
dummy_spatial_lengths
=
{
1
};
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
ds_grid_desc_m_n
=
generate_tuple
(
[
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
return
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
},
Number
<
NumDTensor
>
{});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
static
auto
GetDummyABDsEGridDescriptor
(
const
ConvToGemmBwdDataTransform
&
conv_to_gemm_transform
)
{
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform
.
MakeADescriptor_AK0_M_AK1
();
const
auto
b_grid_desc_bk0_n_bk1
=
conv_to_gemm_transform
.
MakeBDescriptor_BK0_N_BK1
();
const
auto
ds_grid_desc_m_n
=
generate_tuple
([
&
](
auto
)
{
return
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
},
Number
<
NumDTensor
>
{});
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
return
make_tuple
(
return
make_tuple
(
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
}
}
// desc
// desc
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
());
constexpr
static
ConvToGemmBwdDataTransform
dummy_conv_to_gemm_transform
;
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
(
dummy_conv_to_gemm_transform
));
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
...
@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
...
@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_c_wis_lengths
,
/*
ds_g_n_c_wis_lengths
*/
,
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_c_wis_strides
,
ds_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
e_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
e_g_n_c_wis_lengths
,
...
@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
...
@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
b_element_op_
{
b_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
},
cde_element_op_
{
cde_element_op
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_strides_
{
a_g_n_k_wos_strides
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_strides_
{
b_g_k_c_xs_strides
},
ds_g_n_c_wis_lengths_
{
ds_g_n_c_wis_lengths
},
ds_g_n_c_wis_strides_
{
ds_g_n_c_wis_strides
},
e_g_n_c_wis_lengths_
{
e_g_n_c_wis_lengths
},
e_g_n_c_wis_strides_
{
e_g_n_c_wis_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
input_right_pads_
{
input_right_pads
}
{
{
...
@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
...
@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
tildes
=
{
i_ztilde
,
i_ytilde
,
i_xtilde
};
tildes
=
{
i_ztilde
,
i_ytilde
,
i_xtilde
};
}
}
ConvToGemmBwdDataTransform
conv_to_gemm_transform_
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
};
const
auto
a_grid_desc_ak0_m_ak1
=
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
conv_to_gemm_transform_
.
MakeADescriptor_AK0_M_AK1
();
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
const
auto
b_grid_desc_bk0_n_bk1
=
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
conv_to_gemm_transform_
.
MakeBDescriptor_BK0_N_BK1
();
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
DsGridDesc_M_N
ds_grid_desc_m_n
;
DsGridDesc_M_N
ds_grid_desc_m_n
;
// populate Ds desc
// populate Ds desc
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
static_assert
(
is_same_v
<
DLayout
,
ELayout
>
);
ds_grid_desc_m_n
(
i
)
=
ConvToGemmBwdDataTransform
conv_to_gemm_transform_d
{
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
ds_g_n_c_wis_lengths
[
i
],
ds_g_n_c_wis_strides
[
i
],
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_lengths
,
e
_g_n_c_wis_strides
,
ds
_g_n_c_wis_strides
[
i
]
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
input_right_pads
,
input_right_pads
,
tildes
);
tildes
};
ds_grid_desc_m_n
(
i
)
=
conv_to_gemm_transform_d
.
MakeCDescriptor_M_N
();
});
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform_
.
MakeCDescriptor_M_N
();
// for check validity
// for check validity
ds_grid_desc_m_n_container_
.
push_back
(
ds_grid_desc_m_n
);
ds_grid_desc_m_n_container_
.
push_back
(
ds_grid_desc_m_n
);
...
@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
...
@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
BElementwiseOp
b_element_op_
;
BElementwiseOp
b_element_op_
;
CDEElementwiseOp
cde_element_op_
;
CDEElementwiseOp
cde_element_op_
;
// for checking IsSupportedArgument()
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_lengths_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
};
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
View file @
58d75b7a
...
@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
...
@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
typename
ABDataType
,
typename
ABDataType
,
typename
DsPointer
,
typename
DsPointer
,
typename
EDataType
,
typename
EDataType
,
typename
AElementwiseOp
eration
,
typename
AElementwiseOp
,
typename
BElementwiseOp
eration
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
eration
,
typename
CDEElementwiseOp
,
typename
AGridDesc_AK0_M_AK1
,
typename
AGridDesc_AK0_M_AK1
,
typename
BGridDesc_BK0_N_BK1
,
typename
BGridDesc_BK0_N_BK1
,
typename
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
Block2ETileMap
,
typename
Block2ETileMap
,
typename
ComputePtrOffsetOfBatch
,
typename
ComputePtrOffsetOfBatch
,
typename
ComputePtrOffsetOfN
,
bool
HasMainKBlockLoop
>
bool
HasMainKBlockLoop
>
__global__
void
__global__
void
#if CK_USE_LAUNCH_BOUNDS
#if CK_USE_LAUNCH_BOUNDS
...
@@ -73,10 +74,9 @@ __global__ void
...
@@ -73,10 +74,9 @@ __global__ void
const
ABDataType
*
__restrict__
p_b_grid
,
const
ABDataType
*
__restrict__
p_b_grid
,
DsPointer
p_ds_grid
,
DsPointer
p_ds_grid
,
EDataType
*
__restrict__
p_e_grid
,
EDataType
*
__restrict__
p_e_grid
,
const
AElementwiseOperation
a_element_op
,
const
AElementwiseOp
a_element_op
,
const
BElementwiseOperation
b_element_op
,
const
BElementwiseOp
b_element_op
,
const
CDEElementwiseOperation
cde_element_op
,
const
CDEElementwiseOp
cde_element_op
,
const
index_t
batch_count
,
const
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
,
const
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
,
const
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
,
const
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
,
const
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
const
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
...
@@ -84,24 +84,29 @@ __global__ void
...
@@ -84,24 +84,29 @@ __global__ void
const
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
const
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
const
Block2ETileMap
block_2_ctile_map
,
const
Block2ETileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
ComputePtrOffsetOfN
compute_ptr_offset_of_n
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx94__))
defined(__gfx94__))
// offset base pointer for each work-group
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
const
index_t
n_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
z
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
y
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
long_index_t
a_batch_offset
=
amd_wave_read_first_lane
(
const
long_index_t
a_batch_offset
=
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
))
)
;
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
));
const
long_index_t
b_batch_offset
=
amd_wave_read_first_lane
(
const
long_index_t
b_batch_offset
=
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
))
)
;
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
));
const
long_index_t
e_batch_offset
=
amd_wave_read_first_lane
(
const
long_index_t
e_batch_offset
=
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetEPtrOffset
(
g_idx
))
)
;
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetEPtrOffset
(
g_idx
));
const
auto
ds_batch_offset
=
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
auto
ds_batch_offset
=
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
long_index_t
a_n_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_n
.
GetAPtrOffset
(
n_idx
));
const
long_index_t
e_n_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_n
.
GetEPtrOffset
(
n_idx
));
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
DsPointer
p_ds_grid_grp
;
DsPointer
p_ds_grid_grp
;
...
@@ -112,10 +117,10 @@ __global__ void
...
@@ -112,10 +117,10 @@ __global__ void
static_for
<
0
,
NumDTensor
,
1
>
{}(
static_for
<
0
,
NumDTensor
,
1
>
{}(
[
&
](
auto
i
)
{
p_ds_grid_grp
(
i
)
=
p_ds_grid
[
i
]
+
ds_batch_offset
[
i
];
});
[
&
](
auto
i
)
{
p_ds_grid_grp
(
i
)
=
p_ds_grid
[
i
]
+
ds_batch_offset
[
i
];
});
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
+
a_batch_offset
,
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
+
a_batch_offset
+
a_n_offset
,
p_b_grid
+
b_batch_offset
,
p_b_grid
+
b_batch_offset
,
p_ds_grid_grp
,
p_ds_grid_grp
,
p_e_grid
+
e_batch_offset
,
p_e_grid
+
e_batch_offset
+
e_n_offset
,
p_shared
,
p_shared
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
...
@@ -130,7 +135,6 @@ __global__ void
...
@@ -130,7 +135,6 @@ __global__ void
ignore
=
p_b_grid
;
ignore
=
p_b_grid
;
ignore
=
p_ds_grid
;
ignore
=
p_ds_grid
;
ignore
=
p_e_grid
;
ignore
=
p_e_grid
;
ignore
=
batch_count
;
ignore
=
a_grid_desc_ak0_m_ak1
;
ignore
=
a_grid_desc_ak0_m_ak1
;
ignore
=
b_grid_desc_bk0_n_bk1
;
ignore
=
b_grid_desc_bk0_n_bk1
;
ignore
=
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
ignore
=
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
...
@@ -139,6 +143,7 @@ __global__ void
...
@@ -139,6 +143,7 @@ __global__ void
ignore
=
b_element_op
;
ignore
=
b_element_op
;
ignore
=
cde_element_op
;
ignore
=
cde_element_op
;
ignore
=
compute_ptr_offset_of_batch
;
ignore
=
compute_ptr_offset_of_batch
;
ignore
=
compute_ptr_offset_of_n
;
ignore
=
block_2_ctile_map
;
ignore
=
block_2_ctile_map
;
#endif
#endif
}
}
...
@@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -233,82 +238,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
transform_conv_to_gemm
=
using
ConvToGemmBwdDataTransform
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
ConvBackwardDataSpecialization
,
AK1
,
AK1
,
BK1
,
BK1
,
MPerBlock
,
MPerBlock
,
NPerBlock
,
NPerBlock
,
KPerBlock
,
KPerBlock
,
DoPadGemmM
,
DoPadGemmM
,
DoPadGemmN
,
DoPadGemmN
>
{};
ALayout
,
BLayout
,
static
auto
GetDummyABDsEGridDescriptor
()
ELayout
,
true
,
/*SplitConvN*/
ABDataType
,
EDataType
>
;
static
auto
GetDummyABDsEGridDescriptor
(
const
ConvToGemmBwdDataTransform
&
conv_to_gemm_transform
)
{
{
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_lengths
=
{
1
};
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform
.
MakeADescriptor_AK0_M_AK1
();
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_strides
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
>
dummy_spatial_lengths
=
{
1
};
const
auto
b_grid_desc_bk0_n_bk1
=
conv_to_gemm_transform
.
MakeBDescriptor_BK0_N_BK1
();
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
ds_grid_desc_m_n
=
generate_tuple
(
const
auto
ds_grid_desc_m_n
=
generate_tuple
(
[
&
](
auto
i
)
{
[
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DDataType
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsDataType
>>
;
return
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
using
ConvToGemmBwdDataTransformD
=
dummy_tensor_lengths
,
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
dummy_tensor_strides
,
ConvBackwardDataSpecialization
,
dummy_tensor_lengths
,
AK1
,
dummy_tensor_strides
,
BK1
,
dummy_tensor_lengths
,
MPerBlock
,
dummy_tensor_strides
,
NPerBlock
,
dummy_spatial_lengths
,
KPerBlock
,
dummy_spatial_lengths
,
DoPadGemmM
,
dummy_spatial_lengths
,
DoPadGemmN
,
dummy_spatial_lengths
,
ALayout
,
dummy_spatial_lengths
);
BLayout
,
DLayout
,
true
,
/*SplitConvN*/
ABDataType
,
DDataType
>
;
return
ConvToGemmBwdDataTransformD
{}.
MakeCDescriptor_M_N
();
},
},
Number
<
NumDTensor
>
{});
Number
<
NumDTensor
>
{});
const
auto
e_grid_desc_m_n
=
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
return
make_tuple
(
return
make_tuple
(
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
...
@@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
}
// desc
// desc
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
());
constexpr
static
ConvToGemmBwdDataTransform
dummy_conv_to_gemm_transform
;
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
(
dummy_conv_to_gemm_transform
));
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
...
@@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
b_element_op_
{
b_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
},
cde_element_op_
{
cde_element_op
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_strides_
{
a_g_n_k_wos_strides
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_strides_
{
b_g_k_c_xs_strides
},
ds_g_n_c_wis_lengths_
{
ds_g_n_c_wis_lengths
},
ds_g_n_c_wis_strides_
{
ds_g_n_c_wis_strides
},
e_g_n_c_wis_lengths_
{
e_g_n_c_wis_lengths
},
e_g_n_c_wis_strides_
{
e_g_n_c_wis_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
input_right_pads_
{
input_right_pads
}
{
{
...
@@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
p_ds_grid_
(
i
)
=
static_cast
<
const
DDataType
*>
(
p_ds
[
i
]);
p_ds_grid_
(
i
)
=
static_cast
<
const
DDataType
*>
(
p_ds
[
i
]);
});
});
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideB_
=
b_g_k_c_xs_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
0
];
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
compute_ptr_offset_of_batch_
.
BatchStrideDs_
(
i
)
=
ds_g_n_c_wis_strides
[
i
][
0
];
compute_ptr_offset_of_batch_
.
BatchStrideDs_
(
i
)
=
ds_g_n_c_wis_strides
[
i
][
0
];
});
});
...
@@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -526,68 +492,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw
std
::
runtime_error
(
"wrong! only implemented for 2D and 3D now"
);
throw
std
::
runtime_error
(
"wrong! only implemented for 2D and 3D now"
);
}
}
ConvToGemmBwdDataTransform
conv_to_gemm_transform_
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
};
conv_N_per_block_
=
conv_to_gemm_transform_
.
N_
;
const
auto
a_grid_desc_ak0_m_ak1
=
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
conv_to_gemm_transform_
.
MakeADescriptor_AK0_M_AK1
();
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
const
auto
b_grid_desc_bk0_n_bk1
=
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
conv_to_gemm_transform_
.
MakeBDescriptor_BK0_N_BK1
();
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
DsGridDesc_M_N
ds_grid_desc_m_n
;
DsGridDesc_M_N
ds_grid_desc_m_n
;
// populate Ds desc
// populate Ds desc
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DDataType
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsDataType
>>
;
ds_grid_desc_m_n
(
i
)
=
using
ConvToGemmBwdDataTransformD
=
t
ransform
_c
onv
_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
T
ransform
C
onv
BwdDataToGemm_v1
<
NDimSpatial
,
a_g_n_k_wos_lengths
,
ConvBackwardDataSpecialization
,
a_g_n_k_wos_strides
,
AK1
,
b_g_k_c_xs_lengths
,
BK1
,
b_g_k_c_xs_strides
,
MPerBlock
,
ds_g_n_c_wis_lengths
[
i
]
,
NPerBlock
,
ds_g_n_c_wis_strides
[
i
]
,
KPerBlock
,
conv_filter_strides
,
DoPadGemmM
,
conv_filter_dilations
,
DoPadGemmN
,
input_left_pads
,
ALayout
,
input_right_pads
,
BLayout
,
tildes
);
DLayout
,
});
true
,
/*SplitConvN*/
ABDataType
,
const
auto
e_grid_desc_m_n
=
DDataType
>
;
t
ransform
_
conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
ConvToGemmBwdDataT
ransform
D
conv_to_gemm
_transform_d
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
b_g_k_c_xs_strides
,
e
_g_n_c_wis_lengths
,
ds
_g_n_c_wis_lengths
[
i
]
,
e
_g_n_c_wis_strides
,
ds
_g_n_c_wis_strides
[
i
]
,
conv_filter_strides
,
conv_filter_strides
,
conv_filter_dilations
,
conv_filter_dilations
,
input_left_pads
,
input_left_pads
,
input_right_pads
,
input_right_pads
,
tildes
);
tildes
};
ds_grid_desc_m_n
(
i
)
=
conv_to_gemm_transform_d
.
MakeCDescriptor_M_N
();
});
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform_
.
MakeCDescriptor_M_N
();
// desc for problem definition
// desc for problem definition
const
auto
a_grid_desc_m_k
=
const
auto
a_grid_desc_m_k
=
...
@@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
}
}
}
}
}
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideB_
=
b_g_k_c_xs_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
0
];
compute_ptr_offset_of_n_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
1
]
*
conv_N_per_block_
;
compute_ptr_offset_of_n_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
1
]
*
conv_N_per_block_
;
}
}
void
Print
()
const
void
Print
()
const
...
@@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// tensor descriptor for problem definition
// tensor descriptor for problem definition
index_t
num_group_
;
index_t
num_group_
;
index_t
conv_N_per_block_
;
std
::
vector
<
AGridDesc_M_K
>
a_grid_desc_m_k_container_
;
std
::
vector
<
AGridDesc_M_K
>
a_grid_desc_m_k_container_
;
std
::
vector
<
BGridDesc_N_K
>
b_grid_desc_n_k_container_
;
std
::
vector
<
BGridDesc_N_K
>
b_grid_desc_n_k_container_
;
std
::
vector
<
DsGridDesc_M_N
>
ds_grid_desc_m_n_container_
;
std
::
vector
<
DsGridDesc_M_N
>
ds_grid_desc_m_n_container_
;
...
@@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// for computing batch offset
// for computing batch offset
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
compute_ptr_offset_of_batch_
;
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
compute_ptr_offset_of_batch_
;
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
I0
>
compute_ptr_offset_of_n_
;
// element-wise op
// element-wise op
AElementwiseOp
a_element_op_
;
AElementwiseOp
a_element_op_
;
BElementwiseOp
b_element_op_
;
BElementwiseOp
b_element_op_
;
CDEElementwiseOp
cde_element_op_
;
CDEElementwiseOp
cde_element_op_
;
// for checking IsSupportedArgument()
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_lengths_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
};
};
...
@@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg
.
Print
();
arg
.
Print
();
}
}
float
ave_time
=
0
;
const
index_t
gdy
=
arg
.
num_group_
;
const
index_t
num_workgroups_per_Conv_N
=
arg
.
a_g_n_k_wos_lengths_
[
I1
]
/
arg
.
conv_N_per_block_
;
const
index_t
gdz
=
num_workgroups_per_Conv_N
;
float
ave_time
=
0
;
for
(
std
::
size_t
i
=
0
;
i
<
arg
.
a_grid_desc_ak0_m_ak1_container_
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
arg
.
a_grid_desc_ak0_m_ak1_container_
.
size
();
i
++
)
{
{
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_container_
[
i
],
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_container_
[
i
],
...
@@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw
std
::
runtime_error
(
"wrong! device_op has invalid setting"
);
throw
std
::
runtime_error
(
"wrong! device_op has invalid setting"
);
}
}
const
index_t
grid_size
=
arg
.
block_2_etile_map_container_
[
i
].
CalculateGridSize
(
const
index_t
gdx
=
arg
.
block_2_etile_map_container_
[
i
].
CalculateGridSize
(
arg
.
e_grid_desc_m_n_container_
[
i
])
*
arg
.
e_grid_desc_m_n_container_
[
i
]);
arg
.
num_group_
;
const
auto
GemmK
=
arg
.
a_grid_desc_m_k_container_
[
i
].
GetLength
(
I1
);
const
auto
GemmK
=
arg
.
a_grid_desc_m_k_container_
[
i
].
GetLength
(
I1
);
...
@@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
DeviceOp
::
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
DeviceOp
::
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
Block2ETileMap
,
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
,
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
,
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
I0
>
,
has_main_loop
>
;
has_main_loop
>
;
return
launch_and_time_kernel
(
return
launch_and_time_kernel
(
stream_config
,
stream_config
,
kernel
,
kernel
,
dim3
(
g
rid_size
),
dim3
(
g
dx
,
gdy
,
gdz
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
arg
.
p_a_grid_
,
arg
.
p_a_grid_
,
...
@@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
...
@@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg
.
a_element_op_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
b_element_op_
,
arg
.
cde_element_op_
,
arg
.
cde_element_op_
,
arg
.
a_g_n_k_wos_lengths_
[
0
],
// Group count
arg
.
a_grid_desc_ak0_m_ak1_container_
[
i
],
arg
.
a_grid_desc_ak0_m_ak1_container_
[
i
],
arg
.
b_grid_desc_bk0_n_bk1_container_
[
i
],
arg
.
b_grid_desc_bk0_n_bk1_container_
[
i
],
arg
.
ds_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
ds_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
e_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
e_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
block_2_etile_map_container_
[
i
],
arg
.
block_2_etile_map_container_
[
i
],
arg
.
compute_ptr_offset_of_batch_
);
arg
.
compute_ptr_offset_of_batch_
,
arg
.
compute_ptr_offset_of_n_
);
};
};
if
(
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
GemmK
))
if
(
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
GemmK
))
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
View file @
58d75b7a
...
@@ -41,7 +41,7 @@ __global__ void
...
@@ -41,7 +41,7 @@ __global__ void
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
);
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
,
blockIdx
.
z
);
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
karg
.
p_a_grid
+
splitk_batch_offset
.
a_k_split_offset
,
karg
.
p_a_grid
+
splitk_batch_offset
.
a_k_split_offset
,
...
@@ -76,7 +76,7 @@ __global__ void
...
@@ -76,7 +76,7 @@ __global__ void
__shared__
char
p_shared_0
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared_0
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared_1
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared_1
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
);
auto
splitk_batch_offset
=
typename
GridwiseGemm
::
SplitKBatchOffset
(
karg
,
blockIdx
.
z
);
GridwiseGemm
::
template
Run_2Lds
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
GridwiseGemm
::
template
Run_2Lds
<
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
,
TailNum
>(
karg
.
p_a_grid
+
splitk_batch_offset
.
a_k_split_offset
,
karg
.
p_a_grid
+
splitk_batch_offset
.
a_k_split_offset
,
...
@@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
...
@@ -639,27 +639,27 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
struct
SplitKBatchOffset
struct
SplitKBatchOffset
{
{
__device__
SplitKBatchOffset
(
Argument
&
karg
)
__device__
SplitKBatchOffset
(
Argument
&
karg
,
index_t
k_id
)
{
{
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
)
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
)
{
{
a_k_split_offset
=
blockIdx
.
z
*
karg
.
KRead
;
a_k_split_offset
=
k_id
*
karg
.
KRead
;
}
}
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
ALayout
>
)
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
ALayout
>
)
{
{
a_k_split_offset
=
blockIdx
.
z
*
karg
.
KRead
*
karg
.
StrideA
;
a_k_split_offset
=
k_id
*
karg
.
KRead
*
karg
.
StrideA
;
}
}
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>
)
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>
)
{
{
b_k_split_offset
=
blockIdx
.
z
*
karg
.
KRead
*
karg
.
StrideB
;
b_k_split_offset
=
k_id
*
karg
.
KRead
*
karg
.
StrideB
;
}
}
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>
)
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>
)
{
{
b_k_split_offset
=
blockIdx
.
z
*
karg
.
KRead
;
b_k_split_offset
=
k_id
*
karg
.
KRead
;
}
}
if
(
blockIdx
.
z
<
static_cast
<
uint32_t
>
(
karg
.
KBatch
-
1
)
)
if
(
k_id
<
karg
.
KBatch
-
1
)
{
{
karg
.
K
=
karg
.
KRead
;
karg
.
K
=
karg
.
KRead
;
}
}
...
...
include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp
View file @
58d75b7a
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -13,245 +13,614 @@
...
@@ -13,245 +13,614 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
{
template
<
template
<
index_t
NDimSpatial
,
index_t
NDimSpatial
,
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
ConvBwdDataSpecialization
,
index_t
AK1
,
index_t
BK1
,
index_t
GemmMPerBlock
,
index_t
GemmNPerBlock
,
index_t
GemmKPerBlock
,
bool
DoPadGemmM
,
bool
DoPadGemmN
,
typename
ALayout
,
typename
ALayout
,
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
ConvBwdDataSpecialization
>
typename
BLayout
,
constexpr
auto
make_out_grid_desc
(
const
index_t
N
,
typename
CLayout
,
const
index_t
Do
,
bool
SplitN
=
false
,
const
index_t
Ho
,
typename
ADataType
=
float
,
const
index_t
Wo
,
typename
CDataType
=
float
,
const
index_t
K
,
index_t
NumGroupsToMerge
=
1
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_strides
)
typename
IndexType
=
index_t
>
struct
TransformConvBwdDataToGemm_v1
{
{
const
auto
KStride
=
Number
<
1
>
{};
private:
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NHWGK
>
)
static
constexpr
auto
NonSpatialDimsNum
=
Number
<
3
>
{};
{
const
index_t
NStride
=
out_g_n_k_wos_strides
[
1
];
const
index_t
HiStride
=
out_g_n_k_wos_strides
[
3
];
const
index_t
WiStride
=
out_g_n_k_wos_strides
[
4
];
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N
*
Ho
*
Wo
,
K
),
static
constexpr
auto
DIdx
=
NonSpatialDimsNum
;
make_tuple
(
WiStride
,
KStride
));
static
constexpr
auto
HIdx
=
}
NDimSpatial
==
2
?
NonSpatialDimsNum
:
Number
<
NonSpatialDimsNum
+
1
>
{};
else
static
constexpr
auto
WIdx
=
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
+
1
>
{}
:
Number
<
NonSpatialDimsNum
+
2
>
{};
static
constexpr
auto
ZIdx
=
NonSpatialDimsNum
;
static
constexpr
auto
YIdx
=
NDimSpatial
==
2
?
NonSpatialDimsNum
:
Number
<
NonSpatialDimsNum
+
1
>
{};
static
constexpr
auto
XIdx
=
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
+
1
>
{}
:
Number
<
NonSpatialDimsNum
+
2
>
{};
template
<
typename
ConvDimsType
>
static
long_index_t
calculate_element_space_size_impl
(
const
ConvDimsType
&
lengths
,
const
ConvDimsType
&
strides
,
index_t
i
)
{
long_index_t
acc
=
1
;
for
(;
i
<
(
NDimSpatial
+
3
);
i
++
)
{
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Ho
,
Wo
,
K
),
acc
+=
make_tuple
(
NStride
,
HiStride
,
WiStride
,
KS
tride
)
);
static_cast
<
long_index_t
>
(
lengths
[
i
]
-
I1
)
*
static_cast
<
long_index_t
>
(
s
tride
s
[
i
]
);
}
}
return
acc
;
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NDHWGK
>
)
template
<
typename
ConvDimsType
>
static
IndexType
GetSplitedNSize
(
const
ConvDimsType
&
a_g_n_k_wos_lengths
,
const
ConvDimsType
&
a_g_n_k_wos_strides
,
const
ConvDimsType
&
c_g_n_c_wis_lengths
,
const
ConvDimsType
&
c_g_n_c_wis_strides
)
{
{
const
index_t
NStride
=
out_g_n_k_wos_strides
[
1
];
const
long_index_t
a_element_space_size
=
const
index_t
DoStride
=
out_g_n_k_wos_strides
[
3
];
calculate_element_space_size_impl
(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
I1
);
const
index_t
HoStride
=
out_g_n_k_wos_strides
[
4
];
const
long_index_t
c_element_space_size
=
const
index_t
WoStride
=
out_g_n_k_wos_strides
[
5
];
calculate_element_space_size_impl
(
c_g_n_c_wis_lengths
,
c_g_n_c_wis_strides
,
I1
);
if
constexpr
(
ConvBwdDataSpecialization
==
const
long_index_t
element_space_size
=
math
::
max
(
a_element_space_size
*
sizeof
(
ADataType
),
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
c_element_space_size
*
sizeof
(
CDataType
));
Filter1x1Stride1Pad0
)
constexpr
long_index_t
TwoGB
=
(
long_index_t
{
1
}
<<
31
);
const
IndexType
N
=
a_g_n_k_wos_lengths
[
I1
];
if
(
element_space_size
>
TwoGB
)
{
{
// Minimum divisor of N to not exceed 2GB
const
auto
divisor
=
math
::
integer_divide_ceil
(
element_space_size
,
TwoGB
);
return
make_naive_tensor_descriptor
(
make_tuple
(
N
*
Do
*
Ho
*
Wo
,
K
),
if
(
divisor
<=
static_cast
<
double
>
(
N
))
make_tuple
(
WoStride
,
KStride
));
{
// Find least divisor of N larger than element_space_size / TwoGB
// Iterate up to sqrt(N). There are no divisors above this value.
for
(
IndexType
least_divisor
=
divisor
;
least_divisor
*
least_divisor
<=
N
;
least_divisor
++
)
{
if
(
N
%
least_divisor
==
0
)
{
return
N
/
least_divisor
;
}
}
// Not found, process one Convolution N per block
return
1
;
}
else
{
// Not possible to support even after split N.
// Too large tensor.
return
N
;
}
}
}
else
else
{
{
return
make_naive_tensor_descriptor
(
// Split N is not needed.
make_tuple
(
N
,
Do
,
Ho
,
Wo
,
K
),
return
N
;
make_tuple
(
NStride
,
DoStride
,
HoStride
,
WoStride
,
KStride
));
}
}
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNHWK
>
)
public:
__host__
__device__
constexpr
TransformConvBwdDataToGemm_v1
()
{}
template
<
typename
TransformConvBwdDataToGemm_v1Base
>
__host__
__device__
TransformConvBwdDataToGemm_v1
(
const
TransformConvBwdDataToGemm_v1Base
&
transform_conv_bwd_data_to_gemm_base
)
:
N_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
N_
)},
Di_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Di_
)},
Hi_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Hi_
)},
Wi_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Wi_
)},
Do_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Do_
)},
Ho_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Ho_
)},
Wo_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Wo_
)},
Z_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Z_
)},
Y_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
Y_
)},
X_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
X_
)},
K_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
K_
)},
C_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
C_
)},
DiStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
DiStride_
)},
HiStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
HiStride_
)},
WiStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
WiStride_
)},
DoStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
DoStride_
)},
HoStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
HoStride_
)},
WoStride_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
WoStride_
)},
CStrideTensorB_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
CStrideTensorB_
)},
CStrideTensorC_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
CStrideTensorC_
)},
KStrideTensorA_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
KStrideTensorA_
)},
KStrideTensorB_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
KStrideTensorB_
)},
NStrideTensorA_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
NStrideTensorA_
)},
NStrideTensorC_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
NStrideTensorC_
)},
ConvStrideD_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvStrideD_
)},
ConvStrideH_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvStrideH_
)},
ConvStrideW_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvStrideW_
)},
ConvDilationD_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvDilationD_
)},
ConvDilationH_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvDilationH_
)},
ConvDilationW_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ConvDilationW_
)},
InLeftPadD_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InLeftPadD_
)},
InLeftPadH_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InLeftPadH_
)},
InLeftPadW_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InLeftPadW_
)},
InRightPadD_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InRightPadD_
)},
InRightPadH_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InRightPadH_
)},
InRightPadW_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
InRightPadW_
)},
IdxZTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
IdxZTilde_
)},
IdxYTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
IdxYTilde_
)},
IdxXTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
IdxXTilde_
)},
GcdStrideDilationD_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
GcdStrideDilationD_
)},
GcdStrideDilationH_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
GcdStrideDilationH_
)},
GcdStrideDilationW_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
GcdStrideDilationW_
)},
ZTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ZTilde_
)},
YTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
YTilde_
)},
XTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
XTilde_
)},
DTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
DTilde_
)},
HTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
HTilde_
)},
WTilde_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
WTilde_
)},
ZDot_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
ZDot_
)},
YDot_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
YDot_
)},
XDot_
{
static_cast
<
IndexType
>
(
transform_conv_bwd_data_to_gemm_base
.
XDot_
)}
{
{
// assume packed
}
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
template
<
typename
ConvDimsType
,
typename
ConvSpatialDimsType
>
Filter1x1Stride1Pad0
)
__host__
__device__
TransformConvBwdDataToGemm_v1
(
const
ConvDimsType
&
a_g_n_k_wos_lengths
,
const
ConvDimsType
&
a_g_n_k_wos_strides
,
const
ConvDimsType
&
b_g_k_c_xs_lengths
,
const
ConvDimsType
&
b_g_k_c_xs_strides
,
const
ConvDimsType
&
c_g_n_c_wis_lengths
,
const
ConvDimsType
&
c_g_n_c_wis_strides
,
const
ConvSpatialDimsType
&
conv_filter_strides
,
const
ConvSpatialDimsType
&
conv_filter_dilations
,
const
ConvSpatialDimsType
&
input_left_pads
,
const
ConvSpatialDimsType
&
input_right_pads
,
const
ConvSpatialDimsType
&
tildes
)
:
Hi_
{
c_g_n_c_wis_lengths
[
HIdx
]},
Wi_
{
c_g_n_c_wis_lengths
[
WIdx
]},
Ho_
{
a_g_n_k_wos_lengths
[
HIdx
]},
Wo_
{
a_g_n_k_wos_lengths
[
WIdx
]},
Y_
{
b_g_k_c_xs_lengths
[
YIdx
]},
X_
{
b_g_k_c_xs_lengths
[
XIdx
]},
K_
{
a_g_n_k_wos_lengths
[
I2
]},
C_
{
b_g_k_c_xs_lengths
[
I2
]},
HiStride_
{
c_g_n_c_wis_strides
[
HIdx
]},
WiStride_
{
c_g_n_c_wis_strides
[
WIdx
]},
HoStride_
{
a_g_n_k_wos_strides
[
HIdx
]},
WoStride_
{
a_g_n_k_wos_strides
[
WIdx
]},
CStrideTensorB_
{
b_g_k_c_xs_strides
[
I2
]},
CStrideTensorC_
{
c_g_n_c_wis_strides
[
I2
]},
KStrideTensorA_
{
a_g_n_k_wos_strides
[
I2
]},
KStrideTensorB_
{
b_g_k_c_xs_strides
[
I1
]},
NStrideTensorA_
{
a_g_n_k_wos_strides
[
I1
]},
NStrideTensorC_
{
c_g_n_c_wis_strides
[
I1
]},
ConvStrideH_
{
conv_filter_strides
[
HIdx
-
NonSpatialDimsNum
]},
ConvStrideW_
{
conv_filter_strides
[
WIdx
-
NonSpatialDimsNum
]},
ConvDilationH_
{
conv_filter_dilations
[
HIdx
-
NonSpatialDimsNum
]},
ConvDilationW_
{
conv_filter_dilations
[
WIdx
-
NonSpatialDimsNum
]},
InLeftPadH_
{
input_left_pads
[
HIdx
-
NonSpatialDimsNum
]},
InLeftPadW_
{
input_left_pads
[
WIdx
-
NonSpatialDimsNum
]},
InRightPadH_
{
input_right_pads
[
HIdx
-
NonSpatialDimsNum
]},
InRightPadW_
{
input_right_pads
[
WIdx
-
NonSpatialDimsNum
]},
IdxYTilde_
{
tildes
[
YIdx
-
NonSpatialDimsNum
]},
IdxXTilde_
{
tildes
[
XIdx
-
NonSpatialDimsNum
]}
{
static_assert
(
is_same_v
<
ConvSpatialDimsType
,
std
::
array
<
IndexType
,
NDimSpatial
>>
||
is_same_v
<
ConvSpatialDimsType
,
ck
::
Array
<
IndexType
,
NDimSpatial
>>
);
static_assert
(
is_same_v
<
ConvDimsType
,
std
::
array
<
IndexType
,
NDimSpatial
+
I3
>>
||
is_same_v
<
ConvDimsType
,
ck
::
Array
<
IndexType
,
NDimSpatial
+
I3
>>
);
if
constexpr
(
SplitN
)
{
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
*
Ho
*
Wo
,
K
));
N_
=
GetSplitedNSize
(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
c_g_n_c_wis_lengths
,
c_g_n_c_wis_strides
);
}
}
else
else
{
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Ho
,
Wo
,
K
))
;
N_
=
c_g_n_c_wis_lengths
[
I1
]
;
}
}
}
if
constexpr
(
NDimSpatial
==
3
)
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNDHWK
>
)
{
// assume packed
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
*
Do
*
Ho
*
Wo
,
K
));
Di_
=
c_g_n_c_wis_lengths
[
DIdx
];
Do_
=
a_g_n_k_wos_lengths
[
DIdx
];
Z_
=
b_g_k_c_xs_lengths
[
ZIdx
];
DiStride_
=
c_g_n_c_wis_strides
[
DIdx
];
DoStride_
=
a_g_n_k_wos_strides
[
DIdx
];
ConvStrideD_
=
conv_filter_strides
[
DIdx
-
NonSpatialDimsNum
];
ConvDilationD_
=
conv_filter_dilations
[
DIdx
-
NonSpatialDimsNum
];
InLeftPadD_
=
input_left_pads
[
DIdx
-
NonSpatialDimsNum
];
InRightPadD_
=
input_right_pads
[
DIdx
-
NonSpatialDimsNum
];
IdxZTilde_
=
tildes
[
ZIdx
-
NonSpatialDimsNum
];
GcdStrideDilationD_
=
math
::
gcd
(
ConvStrideD_
,
ConvDilationD_
);
ZTilde_
=
ConvStrideD_
/
GcdStrideDilationD_
;
DTilde_
=
Do_
+
math
::
integer_divide_ceil
(
ConvDilationD_
*
(
Z_
-
I1
),
ConvStrideD_
);
ZDot_
=
math
::
integer_divide_ceil
(
Z_
,
ZTilde_
);
}
}
else
else
{
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
,
K
));
Di_
=
Do_
=
Z_
=
ZTilde_
=
ConvStrideD_
=
DTilde_
=
ZDot_
=
1
;
InLeftPadD_
=
InRightPadD_
=
DiStride_
=
DoStride_
=
IdxZTilde_
=
0
;
}
}
}
else
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
ALayout
::
name
());
}
}
template
<
typename
BLayout
>
GcdStrideDilationH_
=
math
::
gcd
(
ConvStrideH_
,
ConvDilationH_
);
constexpr
auto
make_wei_grid_desc
(
GcdStrideDilationW_
=
math
::
gcd
(
ConvStrideW_
,
ConvDilationW_
);
const
index_t
K
,
const
index_t
Z
,
const
index_t
Y
,
const
index_t
X
,
const
index_t
C
)
{
if
constexpr
(
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKYXC
>
)
YTilde_
=
ConvStrideH_
/
GcdStrideDilationH_
;
{
XTilde_
=
ConvStrideW_
/
GcdStrideDilationW_
;
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
Y
,
X
,
C
));
}
else
if
constexpr
(
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKZYXC
>
)
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
Z
,
Y
,
X
,
C
));
}
else
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
BLayout
::
name
());
}
}
template
<
index_t
NDimSpatial
,
typename
CLayout
>
constexpr
auto
make_in_grid_desc
(
const
index_t
N
,
const
index_t
Di
,
const
index_t
Hi
,
const
index_t
Wi
,
const
index_t
C
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_strides
)
{
if
constexpr
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNHWC
>
||
HTilde_
=
Ho_
+
math
::
integer_divide_ceil
(
ConvDilationH_
*
(
Y_
-
I1
),
ConvStrideH_
);
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
WTilde_
=
Wo_
+
math
::
integer_divide_ceil
(
ConvDilationW_
*
(
X_
-
I1
),
ConvStrideW_
);
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
G_NHW_C
>
)
{
YDot_
=
math
::
integer_divide_ceil
(
Y_
,
YTilde_
);
return
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Hi
,
Wi
,
C
),
XDot_
=
math
::
integer_divide_ceil
(
X_
,
XTilde_
);
make_tuple
(
in_g_n_c_wis_strides
[
1
],
in_g_n_c_wis_strides
[
3
],
in_g_n_c_wis_strides
[
4
],
in_g_n_c_wis_strides
[
2
]));
}
}
else
if
constexpr
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNDHWC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NDHWGC
>
)
#if 0 // At now not supported to split tensor
__host__ bool AreDescriptorsSmallerThan2GB() const
{
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
),
constexpr long_index_t TwoGB = (long_index_t{1} << 31);
make_tuple
(
in_g_n_c_wis_strides
[
1
],
in_g_n_c_wis_strides
[
3
],
const long_index_t in_desc_space_size =
in_g_n_c_wis_strides
[
4
],
I1 + (N_ - I1) * NStrideTensorC_ + (Di_ - I1) * DiStride_ + (Hi_ - I1) * HiStride_ +
in_g_n_c_wis_strides
[
5
],
(Wi_ - I1) * WiStride_ + (C_ - I1) * CStrideTensorC_;
in_g_n_c_wis_strides
[
2
]));
const long_index_t out_desc_space_size =
I1 + (N_ - I1) * NStrideTensorA_ + (Do_ - I1) * DoStride_ + (Ho_ - I1) * HoStride_ +
(Wo_ - I1) * WoStride_ + (K_ - I1) * KStrideTensorA_;
bool is_a_descriptor_smaller_than_2GB = (out_desc_space_size * sizeof(ADataType)) <= TwoGB;
bool is_c_descriptor_smaller_than_2GB = (in_desc_space_size * sizeof(CDataType)) <= TwoGB;
return is_a_descriptor_smaller_than_2GB && is_c_descriptor_smaller_than_2GB;
}
}
else
__host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
CDataType* c_grid_ptr_base) const
{
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
CLayout
::
name
());
// Create copies
}
auto conv_to_gemm_transformer_left = *this;
}
auto conv_to_gemm_transformer_right = *this;
IndexType a_right_offset = 0;
IndexType c_right_offset = 0;
// Calculate real filter size
const IndexType z_eff = (Z_ - 1) * ConvDilationD_ + 1;
const IndexType y_eff = (Y_ - 1) * ConvDilationH_ + 1;
const IndexType x_eff = (X_ - 1) * ConvDilationW_ + 1;
// Calculate start position in input for right tensor
const IndexType di_right_transformer_start_idx = (Do_ / 2) * ConvStrideD_;
const IndexType hi_right_transformer_start_idx = (Ho_ / 2) * ConvStrideH_;
const IndexType wi_right_transformer_start_idx = (Wo_ / 2) * ConvStrideW_;
// Calculate last position in input for left tensor
const IndexType di_left_transformer_end_idx = (Do_ / 2 - 1) * ConvStrideD_ + z_eff;
const IndexType hi_left_transformer_end_idx = (Ho_ / 2 - 1) * ConvStrideH_ + y_eff;
const IndexType wi_left_transformer_end_idx = (Wo_ / 2 - 1) * ConvStrideW_ + x_eff;
// Allow to split if whole left padding will be in left tensor and right padding in right
// tensor
const bool is_possible_to_split_d = Do_ != 1 &&
di_right_transformer_start_idx > InLeftPadD_ &&
di_left_transformer_end_idx <= (InLeftPadD_ + Di_);
const bool is_possible_to_split_h = Ho_ != 1 &&
hi_right_transformer_start_idx > InLeftPadH_ &&
hi_left_transformer_end_idx <= (InLeftPadH_ + Hi_);
const bool is_possible_to_split_w = Wo_ != 1 &&
wi_right_transformer_start_idx > InLeftPadW_ &&
wi_left_transformer_end_idx <= (InLeftPadW_ + Wi_);
if(is_possible_to_split_d)
{
// Apply new sizes
// Split output on half
conv_to_gemm_transformer_left.Do_ = Do_ / 2;
conv_to_gemm_transformer_right.Do_ = Do_ - Do_ / 2;
// Assign left padding to left convolution
conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_;
conv_to_gemm_transformer_right.InLeftPadD_ = 0;
// Assign right padding to right convolution
conv_to_gemm_transformer_left.InRightPadD_ = 0;
conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
// Calculate new input size
conv_to_gemm_transformer_left.Di_ = di_left_transformer_end_idx - InLeftPadD_;
conv_to_gemm_transformer_right.Di_ =
math::min(Di_ - (di_right_transformer_start_idx - InLeftPadD_),
(conv_to_gemm_transformer_right.Do_ - 1) * ConvStrideD_ + z_eff);
;
// Calcualte offsets
a_right_offset = (Do_ / 2) * DoStride_;
c_right_offset = ((Do_ / 2) * ConvStrideD_ - InLeftPadD_) * DiStride_;
}
else if(is_possible_to_split_h)
{
conv_to_gemm_transformer_left.Ho_ = Ho_ / 2;
conv_to_gemm_transformer_right.Ho_ = Ho_ - Ho_ / 2;
}
// namespace
conv_to_gemm_transformer_left.InLeftPadH_ = InLeftPadH_;
conv_to_gemm_transformer_right.InLeftPadH_ = 0;
template
<
conv_to_gemm_transformer_left.InRightPadH_ = 0;
index_t
NDimSpatial
,
conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
ConvBwdDataSpecialization
,
index_t
AK1
,
index_t
BK1
,
index_t
GemmMPerBlock
,
index_t
GemmNPerBlock
,
index_t
GemmKPerBlock
,
bool
DoPadGemmM
,
bool
DoPadGemmN
>
struct
TransformConvBwdDataToGemm_v1
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
NonSpatialDimsNum
=
Number
<
3
>
{};
conv_to_gemm_transformer_left.Hi_ = hi_left_transformer_end_idx - InLeftPadH_;
conv_to_gemm_transformer_right.Hi_ =
math::min(Hi_ - (hi_right_transformer_start_idx - InLeftPadH_),
(conv_to_gemm_transformer_right.Ho_ - 1) * ConvStrideH_ + y_eff);
a_right_offset = (Ho_ / 2) * HoStride_;
c_right_offset = ((Ho_ / 2) * ConvStrideH_ - InLeftPadH_) * HiStride_;
}
else if(is_possible_to_split_w)
{
conv_to_gemm_transformer_left.Wo_ = Wo_ / 2;
conv_to_gemm_transformer_right.Wo_ = Wo_ - Wo_ / 2;
static
constexpr
auto
DIdx
=
Number
<
NonSpatialDimsNum
>
{};
conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_;
static
constexpr
auto
HIdx
=
conv_to_gemm_transformer_right.InLeftPadW_ = 0;
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
>
{}
:
Number
<
NonSpatialDimsNum
+
1
>
{};
static
constexpr
auto
WIdx
=
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
+
1
>
{}
:
Number
<
NonSpatialDimsNum
+
2
>
{};
static
constexpr
auto
ZIdx
=
Number
<
NonSpatialDimsNum
>
{};
conv_to_gemm_transformer_left.InRightPadW_ = 0;
static
constexpr
auto
YIdx
=
conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
>
{}
:
Number
<
NonSpatialDimsNum
+
1
>
{};
static
constexpr
auto
XIdx
=
NDimSpatial
==
2
?
Number
<
NonSpatialDimsNum
+
1
>
{}
:
Number
<
NonSpatialDimsNum
+
2
>
{};
template
<
typename
ALayout
,
conv_to_gemm_transformer_left.Wi_ = wi_left_transformer_end_idx - InLeftPadW_;
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
conv_to_gemm_transformer_right.Wi_ =
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNHWK
>
||
math::min(Wi_ - (wi_right_transformer_start_idx - InLeftPadW_),
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNDHWK
>
||
(conv_to_gemm_transformer_right.Wo_ - 1) * ConvStrideW_ + x_eff);
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NHWGK
>
||
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NDHWGK
>
),
a_right_offset = (Wo_ / 2) * WoStride_;
bool
>::
type
=
false
>
c_right_offset = ((Wo_ / 2) * ConvStrideW_ - InLeftPadW_) * WiStride_;
static
auto
MakeADescriptor_AK0_M_AK1
(
}
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_lengths
,
// Return left transform, right transformer, right offset to Input and right offset to
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_strides
,
// Output
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_lengths
,
return ck::make_tuple(conv_to_gemm_transformer_left,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* wei_g_k_c_xs_strides */
,
conv_to_gemm_transformer_right,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_lengths
,
a_grid_ptr_base + a_right_offset,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* in_g_n_c_wis_strides */
,
c_grid_ptr_base + c_right_offset);
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
}
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
__host__ auto SplitConvProblem(const ADataType* a_grid_ptr_base,
const
std
::
array
<
index_t
,
NDimSpatial
>&
/* input_right_pads */
,
CDataType* c_grid_ptr_base) const
const
std
::
array
<
index_t
,
NDimSpatial
>&
tildes
)
{
{
index_t
i_ztilde
=
tildes
[
ZIdx
-
NonSpatialDimsNum
];
// Create copies
index_t
i_ytilde
=
tildes
[
YIdx
-
NonSpatialDimsNum
];
auto conv_to_gemm_transformer_left = *this;
index_t
i_xtilde
=
tildes
[
XIdx
-
NonSpatialDimsNum
];
auto conv_to_gemm_transformer_right = *this;
IndexType a_right_offset = 0;
IndexType c_right_offset = 0;
// Calculate start position in input for right tensor
const IndexType do_right_transformer_start_idx = math::integer_divide_ceil((Di_ / 2) + InLeftPadD_ - ((Z_ - 1) * ConvDilationD_), ConvStrideD_);
const IndexType ho_right_transformer_start_idx = math::integer_divide_ceil((Hi_ / 2) + InLeftPadH_ - ((Y_ - 1) * ConvDilationH_), ConvStrideH_);
const IndexType wo_right_transformer_start_idx = math::integer_divide_ceil((Wi_ / 2) + InLeftPadW_ - ((X_ - 1) * ConvDilationW_), ConvStrideW_);
// Calculate last position in input for left tensor
const IndexType do_left_transformer_end_idx = math::integer_divide_ceil((Di_ / 2 - 1) + InLeftPadD_, ConvStrideD_);
const IndexType ho_left_transformer_end_idx = math::integer_divide_ceil((Hi_ / 2 - 1) + InLeftPadH_, ConvStrideH_);
const IndexType wo_left_transformer_end_idx = math::integer_divide_ceil((Wi_ / 2 - 1) + InLeftPadW_, ConvStrideW_);
if(Di_!=1)
{
// Apply new sizes
// Split output on half
conv_to_gemm_transformer_left.Di_ = Di_ / 2;
conv_to_gemm_transformer_right.Di_ = Di_ - Di_ / 2;
// Assign left padding to left convolution
conv_to_gemm_transformer_left.InLeftPadD_ = InLeftPadD_;
conv_to_gemm_transformer_right.InLeftPadD_ = 0;
// // Assign right padding to right convolution
conv_to_gemm_transformer_left.InRightPadD_ = 0;
conv_to_gemm_transformer_right.InRightPadD_ = InRightPadD_;
// Calculate new input size
conv_to_gemm_transformer_left.Do_ = do_left_transformer_end_idx;
conv_to_gemm_transformer_right.Do_ = Do_ - do_right_transformer_start_idx;
;
// Calcualte offsets
a_right_offset = do_right_transformer_start_idx * DoStride_;
c_right_offset = (Di_ / 2) * DiStride_;
}
else if(Hi_!=1)
{
// Apply new sizes
// Split output on half
conv_to_gemm_transformer_left.Hi_ = Hi_ / 2;
conv_to_gemm_transformer_right.Hi_ = Hi_ - Hi_ / 2;
// Assign left padding to left convolution
conv_to_gemm_transformer_left.InLeftPadH_ = InLeftPadH_;
conv_to_gemm_transformer_right.InLeftPadH_ = 0;
// // Assign right padding to right convolution
conv_to_gemm_transformer_left.InRightPadH_ = 0;
conv_to_gemm_transformer_right.InRightPadH_ = InRightPadH_;
// Calculate new input size
conv_to_gemm_transformer_left.Ho_ = ho_left_transformer_end_idx ;
conv_to_gemm_transformer_right.Ho_ = Ho_ - ho_right_transformer_start_idx ;
;
// Calcualte offsets
a_right_offset = ho_right_transformer_start_idx * HoStride_;
c_right_offset = (Hi_ / 2) * HiStride_;
}
else if(Wi_!=1)
{
// Apply new sizes
// Split output on half
conv_to_gemm_transformer_left.Wi_ = Wi_ / 2;
conv_to_gemm_transformer_right.Wi_ = Wi_ - Wi_ / 2;
// Assign left padding to left convolution
conv_to_gemm_transformer_left.InLeftPadW_ = InLeftPadW_;
conv_to_gemm_transformer_right.InLeftPadW_ = 0;
// Assign right padding to right convolution
conv_to_gemm_transformer_left.InRightPadW_ = 0;
conv_to_gemm_transformer_right.InRightPadW_ = InRightPadW_;
// Calculate new input size
conv_to_gemm_transformer_left.Wo_ = wo_left_transformer_end_idx;
conv_to_gemm_transformer_right.Wo_ = Wo_ - wo_right_transformer_start_idx;
;
// Calcualte offsets
a_right_offset = wo_right_transformer_start_idx * WoStride_;
c_right_offset = (Wi_ / 2) * WiStride_;
}
// Return left transform, right transformer, right offset to Input and right offset to
// Output
return ck::make_tuple(conv_to_gemm_transformer_left,
conv_to_gemm_transformer_right,
a_grid_ptr_base + a_right_offset,
c_grid_ptr_base + c_right_offset);
}
#endif
const
index_t
N
=
in_g_n_c_wis_lengths
[
1
];
__host__
__device__
auto
MakeOutGridDesc
()
const
const
index_t
K
=
wei_g_k_c_xs_lengths
[
1
];
{
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NHWGK
>
)
{
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
const
index_t
Di
=
NDimSpatial
==
3
?
in_g_n_c_wis_lengths
[
DIdx
]
:
1
;
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
*
Ho_
*
Wo_
,
K_
),
const
index_t
Hi
=
in_g_n_c_wis_lengths
[
HIdx
];
make_tuple
(
WoStride_
,
KStrideTensorA_
));
const
index_t
Wi
=
in_g_n_c_wis_lengths
[
WIdx
];
}
else
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
,
Ho_
,
Wo_
,
K_
),
make_tuple
(
NStrideTensorA_
,
HoStride_
,
WoStride_
,
KStrideTensorA_
));
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
NDHWGK
>
)
{
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
const
index_t
Do
=
NDimSpatial
==
3
?
out_g_n_k_wos_lengths
[
DIdx
]
:
1
;
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
*
Do_
*
Ho_
*
Wo_
,
K_
),
const
index_t
Ho
=
out_g_n_k_wos_lengths
[
HIdx
];
make_tuple
(
WoStride_
,
KStrideTensorA_
));
const
index_t
Wo
=
out_g_n_k_wos_lengths
[
WIdx
];
}
else
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
,
Do_
,
Ho_
,
Wo_
,
K_
),
make_tuple
(
NStrideTensorA_
,
DoStride_
,
HoStride_
,
WoStride_
,
KStrideTensorA_
));
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNHWK
>
)
{
// assume packed
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N_
*
Ho_
*
Wo_
,
K_
));
}
else
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N_
,
Ho_
,
Wo_
,
K_
));
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
tensor_layout
::
convolution
::
GNDHWK
>
)
{
// assume packed
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N_
*
Do_
*
Ho_
*
Wo_
,
K_
));
}
else
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N_
,
Do_
,
Ho_
,
Wo_
,
K_
));
}
}
else
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
ALayout
::
name
());
}
}
const
index_t
Z
=
NDimSpatial
==
3
?
wei_g_k_c_xs_lengths
[
ZIdx
]
:
1
;
__host__
__device__
auto
MakeWeiGridDesc
()
const
const
index_t
Y
=
wei_g_k_c_xs_lengths
[
YIdx
];
{
const
index_t
X
=
wei_g_k_c_xs_lengths
[
XIdx
];
const
index_t
InLeftPadD
=
input_left_pads
[
DIdx
-
NonSpatialDimsNum
];
if
constexpr
(
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKYXC
>
)
const
index_t
InLeftPadH
=
input_left_pads
[
HIdx
-
NonSpatialDimsNum
];
{
const
index_t
InLeftPadW
=
input_left_pads
[
WIdx
-
NonSpatialDimsNum
];
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
K_
,
Y_
,
X_
,
C_
));
}
else
if
constexpr
(
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKZYXC
>
)
{
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
K_
,
Z_
,
Y_
,
X_
,
C_
));
}
else
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
BLayout
::
name
());
}
}
const
index_t
ConvStrideD
=
conv_filter_strides
[
DIdx
-
NonSpatialDimsNum
];
__host__
__device__
auto
MakeInGridDesc
()
const
const
index_t
ConvStrideH
=
conv_filter_strides
[
HIdx
-
NonSpatialDimsNum
];
{
const
index_t
ConvStrideW
=
conv_filter_strides
[
WIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationD
=
conv_filter_dilations
[
DIdx
-
NonSpatialDimsNum
];
if
constexpr
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNHWC
>
||
const
index_t
ConvDilationH
=
conv_filter_dilations
[
HIdx
-
NonSpatialDimsNum
];
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
const
index_t
ConvDilationW
=
conv_filter_dilations
[
WIdx
-
NonSpatialDimsNum
];
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
G_NHW_C
>
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
,
Hi_
,
Wi_
,
C_
),
make_tuple
(
NStrideTensorC_
,
HiStride_
,
WiStride_
,
CStrideTensorC_
));
}
else
if
constexpr
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNDHWC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NDHWGC
>
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
N_
,
Di_
,
Hi_
,
Wi_
,
C_
),
make_tuple
(
NStrideTensorC_
,
DiStride_
,
HiStride_
,
WiStride_
,
CStrideTensorC_
));
}
else
{
throw
std
::
runtime_error
(
"wrong! unsupported layout: "
+
CLayout
::
name
());
}
}
template
<
typename
ALayout_
=
ALayout
,
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
(
is_same_v
<
ALayout_
,
tensor_layout
::
convolution
::
GNHWK
>
||
is_same_v
<
ALayout_
,
tensor_layout
::
convolution
::
GNDHWK
>
||
is_same_v
<
ALayout_
,
tensor_layout
::
convolution
::
NHWGK
>
||
is_same_v
<
ALayout_
,
tensor_layout
::
convolution
::
NDHWGK
>
),
bool
>::
type
=
false
>
__host__
__device__
auto
MakeADescriptor_AK0_M_AK1
()
const
{
// n_do_ho_wo_k for 3d or n_ho_wo_k for 2d
// n_do_ho_wo_k for 3d or n_ho_wo_k for 2d
const
auto
out_grid_desc
=
const
auto
out_grid_desc
=
MakeOutGridDesc
();
make_out_grid_desc
<
NDimSpatial
,
ALayout
,
ConvBwdDataSpecialization
>
(
N
,
Do
,
Ho
,
Wo
,
K
,
out_g_n_k_wos_strides
);
if
constexpr
(
ConvBwdDataSpecialization
==
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
Filter1x1Stride1Pad0
)
{
{
const
index_t
AK0
=
math
::
integer_divide_ceil
(
K
,
AK1
);
const
index_t
AK0
=
math
::
integer_divide_ceil
(
K
_
,
AK1
);
// A: output tensor
// A: output tensor
const
auto
out_gemmak0_gemmmraw_gemmak1_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_gemmak0_gemmmraw_gemmak1_grid_desc
=
transform_tensor_descriptor
(
out_grid_desc
,
out_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
*
Do
*
Ho
*
Wo
),
make_tuple
(
make_pass_through_transform
(
N
_
*
Do
_
*
Ho
_
*
Wo
_
),
make_unmerge_transform
(
make_tuple
(
AK0
,
AK1
))),
make_unmerge_transform
(
make_tuple
(
AK0
,
AK1
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}));
...
@@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -266,82 +635,63 @@ struct TransformConvBwdDataToGemm_v1
}
}
else
else
{
{
const
auto
GcdStrideDilationD
=
math
::
gcd
(
ConvStrideD
,
ConvDilationD
);
const
auto
GcdStrideDilationH
=
math
::
gcd
(
ConvStrideH
,
ConvDilationH
);
const
auto
GcdStrideDilationW
=
math
::
gcd
(
ConvStrideW
,
ConvDilationW
);
const
auto
ZTilde
=
ConvStrideD
/
GcdStrideDilationD
;
const
auto
YTilde
=
ConvStrideH
/
GcdStrideDilationH
;
const
auto
XTilde
=
ConvStrideW
/
GcdStrideDilationW
;
const
auto
ZDot
=
math
::
integer_divide_ceil
(
Z
,
ZTilde
);
const
auto
YDot
=
math
::
integer_divide_ceil
(
Y
,
YTilde
);
const
auto
XDot
=
math
::
integer_divide_ceil
(
X
,
XTilde
);
const
auto
DTilde
=
Do
+
math
::
integer_divide_ceil
(
ConvDilationD
*
(
Z
-
I1
),
ConvStrideD
);
const
auto
HTilde
=
Ho
+
math
::
integer_divide_ceil
(
ConvDilationH
*
(
Y
-
I1
),
ConvStrideH
);
const
auto
WTilde
=
Wo
+
math
::
integer_divide_ceil
(
ConvDilationW
*
(
X
-
I1
),
ConvStrideW
);
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
const
auto
IDTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IDTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadD
-
ConvDilationD
*
(
ZTilde
-
I1
)),
ConvStrideD
);
math
::
max
(
I0
,
InLeftPadD
_
-
ConvDilationD
_
*
(
ZTilde
_
-
I1
)),
ConvStrideD
_
);
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadH
-
ConvDilationH
*
(
YTilde
-
I1
)),
ConvStrideH
);
math
::
max
(
I0
,
InLeftPadH
_
-
ConvDilationH
_
*
(
YTilde
_
-
I1
)),
ConvStrideH
_
);
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadW
-
ConvDilationW
*
(
XTilde
-
I1
)),
ConvStrideW
);
math
::
max
(
I0
,
InLeftPadW
_
-
ConvDilationW
_
*
(
XTilde
_
-
I1
)),
ConvStrideW
_
);
const
auto
IDTildeSliceEnd
=
math
::
min
(
const
auto
IDTildeSliceEnd
=
math
::
min
(
DTilde
,
math
::
integer_divide_ceil
(
InLeftPadD
+
Di
-
I1
,
ConvStrideD
)
+
I1
);
DTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadD
_
+
Di
_
-
I1
,
ConvStrideD
_
)
+
I1
);
const
auto
IHTildeSliceEnd
=
math
::
min
(
const
auto
IHTildeSliceEnd
=
math
::
min
(
HTilde
,
math
::
integer_divide_ceil
(
InLeftPadH
+
Hi
-
I1
,
ConvStrideH
)
+
I1
);
HTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadH
_
+
Hi
_
-
I1
,
ConvStrideH
_
)
+
I1
);
const
auto
IWTildeSliceEnd
=
math
::
min
(
const
auto
IWTildeSliceEnd
=
math
::
min
(
WTilde
,
math
::
integer_divide_ceil
(
InLeftPadW
+
Wi
-
I1
,
ConvStrideW
)
+
I1
);
WTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadW
_
+
Wi
_
-
I1
,
ConvStrideW
_
)
+
I1
);
const
auto
DTildeSlice
=
IDTildeSliceEnd
-
IDTildeSliceBegin
;
const
auto
DTildeSlice
=
IDTildeSliceEnd
-
IDTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
WTildeSlice
=
IWTildeSliceEnd
-
IWTildeSliceBegin
;
const
auto
WTildeSlice
=
IWTildeSliceEnd
-
IWTildeSliceBegin
;
// GemmK is different for each GEMM
// GemmK is different for each GEMM
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
Z
-
i_zt
ilde
,
ZTilde
);
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
Z
_
-
IdxZT
ilde
_
,
ZTilde
_
);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
Y
-
i_yt
ilde
,
YTilde
);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
Y
_
-
IdxYT
ilde
_
,
YTilde
_
);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
X
-
i_xt
ilde
,
XTilde
);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
X
_
-
IdxXT
ilde
_
,
XTilde
_
);
if
constexpr
(
NDimSpatial
==
2
)
if
constexpr
(
NDimSpatial
==
2
)
{
{
// A: output tensor
// A: output tensor
const
auto
out_n_hop_wop_k_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_n_hop_wop_k_grid_desc
=
transform_tensor_descriptor
(
out_grid_desc
,
out_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_pad_transform
(
Ho
,
I0
,
I0
),
make_pad_transform
(
Ho
_
,
I0
,
I0
),
make_pad_transform
(
Wo
,
I0
,
I0
),
make_pad_transform
(
Wo
_
,
I0
,
I0
),
make_pass_through_transform
(
K
)),
make_pass_through_transform
(
K
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
=
transform_tensor_descriptor
(
out_n_hop_wop_k_grid_desc
,
out_n_hop_wop_k_grid_desc
,
make_tuple
(
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_tuple
(
YDot
,
HTilde
),
make_embed_transform
(
make_tuple
(
YDot
_
,
HTilde
_
),
make_tuple
(
-
ConvDilationH
/
GcdStrideDilationH
,
I1
)),
make_tuple
(
-
ConvDilationH
_
/
GcdStrideDilationH
_
,
I1
)),
make_embed_transform
(
make_tuple
(
XDot
,
WTilde
),
make_embed_transform
(
make_tuple
(
XDot
_
,
WTilde
_
),
make_tuple
(
-
ConvDilationW
/
GcdStrideDilationW
,
I1
)),
make_tuple
(
-
ConvDilationW
_
/
GcdStrideDilationW
_
,
I1
)),
make_pass_through_transform
(
K
)),
make_pass_through_transform
(
K
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
=
const
auto
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
,
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
YDot
_
,
I0
,
YDotSlice
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
HTilde
_
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
XDot
_
,
I0
,
XDotSlice
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_slice_transform
(
WTilde
_
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
K
)),
make_pass_through_transform
(
K
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -357,8 +707,8 @@ struct TransformConvBwdDataToGemm_v1
const
auto
out_gemmk_gemmmraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_gemmk_gemmmraw_grid_desc
=
transform_tensor_descriptor
(
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
,
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K
_
)),
make_merge_transform
(
make_tuple
(
N
,
HTildeSlice
,
WTildeSlice
))),
make_merge_transform
(
make_tuple
(
N
_
,
HTildeSlice
,
WTildeSlice
))),
make_tuple
(
Sequence
<
1
,
3
,
5
>
{},
Sequence
<
0
,
2
,
4
>
{}),
make_tuple
(
Sequence
<
1
,
3
,
5
>
{},
Sequence
<
0
,
2
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -385,11 +735,11 @@ struct TransformConvBwdDataToGemm_v1
// A: output tensor
// A: output tensor
const
auto
out_n_hop_wop_k_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_n_hop_wop_k_grid_desc
=
transform_tensor_descriptor
(
out_grid_desc
,
out_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_pad_transform
(
Do
,
I0
,
I0
),
make_pad_transform
(
Do
_
,
I0
,
I0
),
make_pad_transform
(
Ho
,
I0
,
I0
),
make_pad_transform
(
Ho
_
,
I0
,
I0
),
make_pad_transform
(
Wo
,
I0
,
I0
),
make_pad_transform
(
Wo
_
,
I0
,
I0
),
make_pass_through_transform
(
K
)),
make_pass_through_transform
(
K
_
)),
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
make_tuple
(
...
@@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -398,17 +748,17 @@ struct TransformConvBwdDataToGemm_v1
const
auto
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc
=
const
auto
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
out_n_hop_wop_k_grid_desc
,
out_n_hop_wop_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_embed_transform
(
make_tuple
(
ZDot
,
DTilde
),
make_tuple
(
ZDot
_
,
DTilde
_
),
make_tuple
(
-
ConvDilationD
/
GcdStrideDilationD
,
I1
)),
make_tuple
(
-
ConvDilationD
_
/
GcdStrideDilationD
_
,
I1
)),
make_embed_transform
(
make_embed_transform
(
make_tuple
(
YDot
,
HTilde
),
make_tuple
(
YDot
_
,
HTilde
_
),
make_tuple
(
-
ConvDilationH
/
GcdStrideDilationH
,
I1
)),
make_tuple
(
-
ConvDilationH
_
/
GcdStrideDilationH
_
,
I1
)),
make_embed_transform
(
make_embed_transform
(
make_tuple
(
XDot
,
WTilde
),
make_tuple
(
XDot
_
,
WTilde
_
),
make_tuple
(
-
ConvDilationW
/
GcdStrideDilationW
,
I1
)),
make_tuple
(
-
ConvDilationW
_
/
GcdStrideDilationW
_
,
I1
)),
make_pass_through_transform
(
K
)),
make_pass_through_transform
(
K
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -424,14 +774,15 @@ struct TransformConvBwdDataToGemm_v1
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
=
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc
,
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_slice_transform
(
ZDot
,
I0
,
ZDotSlice
),
make_pass_through_transform
(
N_
),
make_slice_transform
(
DTilde
,
IDTildeSliceBegin
,
DTildeSlice
),
make_slice_transform
(
ZDot_
,
I0
,
ZDotSlice
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
DTilde_
,
IDTildeSliceBegin
,
DTildeSlice
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
YDot_
,
I0
,
YDotSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
HTilde_
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_slice_transform
(
XDot_
,
I0
,
XDotSlice
),
make_pass_through_transform
(
K
)),
make_slice_transform
(
WTilde_
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
K_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -452,8 +803,9 @@ struct TransformConvBwdDataToGemm_v1
const
auto
out_gemmk_gemmmraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
out_gemmk_gemmmraw_grid_desc
=
transform_tensor_descriptor
(
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
,
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k_grid_desc
,
make_tuple
(
make_tuple
(
make_merge_transform
(
make_tuple
(
ZDotSlice
,
YDotSlice
,
XDotSlice
,
K
)),
make_merge_transform
(
make_tuple
(
ZDotSlice
,
YDotSlice
,
XDotSlice
,
K_
)),
make_merge_transform
(
make_tuple
(
N
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
))),
make_merge_transform
(
make_tuple
(
N_
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
))),
make_tuple
(
Sequence
<
1
,
3
,
5
,
7
>
{},
Sequence
<
0
,
2
,
4
,
6
>
{}),
make_tuple
(
Sequence
<
1
,
3
,
5
,
7
>
{},
Sequence
<
0
,
2
,
4
,
6
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -482,66 +834,31 @@ struct TransformConvBwdDataToGemm_v1
}
}
}
}
template
<
typename
BLayout
,
template
<
typename
BLayout_
=
BLayout
,
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
(
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKYXC
>
||
(
is_same_v
<
BLayout
_
,
tensor_layout
::
convolution
::
GKYXC
>
||
is_same_v
<
BLayout
,
tensor_layout
::
convolution
::
GKZYXC
>
),
is_same_v
<
BLayout
_
,
tensor_layout
::
convolution
::
GKZYXC
>
),
bool
>::
type
=
false
>
bool
>::
type
=
false
>
static
auto
MakeBDescriptor_BK0_N_BK1
(
__host__
__device__
auto
MakeBDescriptor_BK0_N_BK1
()
const
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* out_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* wei_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* in_g_n_c_wis_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
/* input_left_pads */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
/* input_right_pads */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
tildes
)
{
{
index_t
i_ztilde
=
tildes
[
ZIdx
-
NonSpatialDimsNum
];
index_t
i_ytilde
=
tildes
[
YIdx
-
NonSpatialDimsNum
];
index_t
i_xtilde
=
tildes
[
XIdx
-
NonSpatialDimsNum
];
const
index_t
N
=
in_g_n_c_wis_lengths
[
1
];
const
index_t
K
=
wei_g_k_c_xs_lengths
[
1
];
const
index_t
C
=
wei_g_k_c_xs_lengths
[
2
];
const
index_t
Do
=
NDimSpatial
==
3
?
out_g_n_k_wos_lengths
[
DIdx
]
:
1
;
const
index_t
Ho
=
out_g_n_k_wos_lengths
[
HIdx
];
const
index_t
Wo
=
out_g_n_k_wos_lengths
[
WIdx
];
const
index_t
Z
=
NDimSpatial
==
3
?
wei_g_k_c_xs_lengths
[
ZIdx
]
:
1
;
const
index_t
Y
=
wei_g_k_c_xs_lengths
[
YIdx
];
const
index_t
X
=
wei_g_k_c_xs_lengths
[
XIdx
];
const
index_t
ConvStrideD
=
conv_filter_strides
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
ConvStrideH
=
conv_filter_strides
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
WIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationD
=
conv_filter_dilations
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationH
=
conv_filter_dilations
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
WIdx
-
NonSpatialDimsNum
];
// assume packed
// assume packed
// k_y_x_c for 2d or k_z_y_x_c for 3d
// k_y_x_c for 2d or k_z_y_x_c for 3d
const
auto
wei_grid_desc
=
m
ake
_wei_g
rid
_d
esc
<
BLayout
>
(
K
,
Z
,
Y
,
X
,
C
);
const
auto
wei_grid_desc
=
M
ake
WeiG
rid
D
esc
(
);
if
constexpr
(
ConvBwdDataSpecialization
==
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
Filter1x1Stride1Pad0
)
{
{
const
index_t
BK0
=
math
::
integer_divide_ceil
(
K
,
BK1
);
const
index_t
BK0
=
math
::
integer_divide_ceil
(
K
_
,
BK1
);
// B: weight tensor
// B: weight tensor
const
auto
wei_gemmbk0_gemmnraw_gemmbk1_grid_desc
=
const
auto
wei_gemmbk0_gemmnraw_gemmbk1_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C
)),
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
_
,
C
_
)),
make_tuple
(
make_unmerge_transform
(
make_tuple
(
BK0
,
BK1
)),
make_tuple
(
make_unmerge_transform
(
make_tuple
(
BK0
,
BK1
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
make_naive_tensor_descriptor
(
make_tuple
(
N
*
Do
*
Ho
*
Wo
,
C
),
make_tuple
(
I0
,
I1
));
make_naive_tensor_descriptor
(
make_tuple
(
N
_
*
Do
_
*
Ho
_
*
Wo
_
,
C
_
),
make_tuple
(
I0
,
I1
));
const
auto
wei_gemmbk0_gemmn_gemmbk1_grid_desc
=
const
auto
wei_gemmbk0_gemmn_gemmbk1_grid_desc
=
ck
::
tensor_operation
::
device
::
PadTensorDescriptor
(
ck
::
tensor_operation
::
device
::
PadTensorDescriptor
(
...
@@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -553,22 +870,10 @@ struct TransformConvBwdDataToGemm_v1
}
}
else
else
{
{
const
auto
GcdStrideDilationD
=
math
::
gcd
(
ConvStrideD
,
ConvDilationD
);
const
auto
GcdStrideDilationH
=
math
::
gcd
(
ConvStrideH
,
ConvDilationH
);
const
auto
GcdStrideDilationW
=
math
::
gcd
(
ConvStrideW
,
ConvDilationW
);
const
auto
ZTilde
=
ConvStrideD
/
GcdStrideDilationD
;
const
auto
YTilde
=
ConvStrideH
/
GcdStrideDilationH
;
const
auto
XTilde
=
ConvStrideW
/
GcdStrideDilationW
;
const
auto
ZDot
=
math
::
integer_divide_ceil
(
Z
,
ZTilde
);
const
auto
YDot
=
math
::
integer_divide_ceil
(
Y
,
YTilde
);
const
auto
XDot
=
math
::
integer_divide_ceil
(
X
,
XTilde
);
// GemmK is different for each GEMM
// GemmK is different for each GEMM
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
Z
-
i_zt
ilde
,
ZTilde
);
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
Z
_
-
IdxZT
ilde
_
,
ZTilde
_
);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
Y
-
i_yt
ilde
,
YTilde
);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
Y
_
-
IdxYT
ilde
_
,
YTilde
_
);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
X
-
i_xt
ilde
,
XTilde
);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
X
_
-
IdxXT
ilde
_
,
XTilde
_
);
// B weight tensor
// B weight tensor
if
constexpr
(
NDimSpatial
==
2
)
if
constexpr
(
NDimSpatial
==
2
)
...
@@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -576,23 +881,23 @@ struct TransformConvBwdDataToGemm_v1
const
auto
wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc
=
transform_tensor_descriptor
(
wei_grid_desc
,
wei_grid_desc
,
make_tuple
(
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
K
_
),
make_embed_transform
(
make_tuple
(
YDot
,
YTilde
),
make_embed_transform
(
make_tuple
(
YDot
_
,
YTilde
_
),
make_tuple
(
ConvStrideH
/
GcdStrideDilationH
,
I1
)),
make_tuple
(
ConvStrideH
_
/
GcdStrideDilationH
_
,
I1
)),
make_embed_transform
(
make_tuple
(
XDot
,
XTilde
),
make_embed_transform
(
make_tuple
(
XDot
_
,
XTilde
_
),
make_tuple
(
ConvStrideW
/
GcdStrideDilationW
,
I1
)),
make_tuple
(
ConvStrideW
_
/
GcdStrideDilationW
_
,
I1
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
wei_k_ydotslice_xdotslice_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
wei_k_ydotslice_xdotslice_c_grid_desc
=
transform_tensor_descriptor
(
wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc
,
wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
K
),
make_tuple
(
make_pass_through_transform
(
K
_
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
YDot
_
,
I0
,
YDotSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
XDot
_
,
I0
,
XDotSlice
),
make_freeze_transform
(
i_yt
ilde
),
make_freeze_transform
(
IdxYT
ilde
_
),
make_freeze_transform
(
i_xt
ilde
),
make_freeze_transform
(
IdxXT
ilde
_
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
3
>
{},
...
@@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -608,8 +913,8 @@ struct TransformConvBwdDataToGemm_v1
const
auto
wei_gemmk_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
wei_gemmk_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
wei_k_ydotslice_xdotslice_c_grid_desc
,
wei_k_ydotslice_xdotslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
1
,
2
,
0
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
1
,
2
,
0
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -636,15 +941,17 @@ struct TransformConvBwdDataToGemm_v1
const
auto
wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc
=
const
auto
wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
wei_grid_desc
,
wei_grid_desc
,
make_tuple
(
make_tuple
(
make_pass_through_transform
(
K_
),
make_pass_through_transform
(
K
),
make_embed_transform
(
make_embed_transform
(
make_tuple
(
ZDot
,
ZTilde
),
make_tuple
(
ZDot_
,
ZTilde_
),
make_tuple
(
ConvStrideD
/
GcdStrideDilationD
,
I1
)),
make_tuple
(
ConvStrideD_
/
GcdStrideDilationD_
,
I1
)),
make_embed_transform
(
make_tuple
(
YDot
,
YTilde
),
make_embed_transform
(
make_tuple
(
ConvStrideH
/
GcdStrideDilationH
,
I1
)),
make_tuple
(
YDot_
,
YTilde_
),
make_embed_transform
(
make_tuple
(
XDot
,
XTilde
),
make_tuple
(
ConvStrideH_
/
GcdStrideDilationH_
,
I1
)),
make_tuple
(
ConvStrideW
/
GcdStrideDilationW
,
I1
)),
make_embed_transform
(
make_pass_through_transform
(
C
)),
make_tuple
(
XDot_
,
XTilde_
),
make_tuple
(
ConvStrideW_
/
GcdStrideDilationW_
,
I1
)),
make_pass_through_transform
(
C_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -659,14 +966,14 @@ struct TransformConvBwdDataToGemm_v1
const
auto
wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc
=
const
auto
wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc
,
wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
K
),
make_tuple
(
make_pass_through_transform
(
K
_
),
make_slice_transform
(
ZDot
,
I0
,
ZDotSlice
),
make_slice_transform
(
ZDot
_
,
I0
,
ZDotSlice
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
YDot
_
,
I0
,
YDotSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
XDot
_
,
I0
,
XDotSlice
),
make_freeze_transform
(
i_zt
ilde
),
make_freeze_transform
(
IdxZT
ilde
_
),
make_freeze_transform
(
i_yt
ilde
),
make_freeze_transform
(
IdxYT
ilde
_
),
make_freeze_transform
(
i_xt
ilde
),
make_freeze_transform
(
IdxXT
ilde
_
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
3
>
{},
...
@@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -686,8 +993,9 @@ struct TransformConvBwdDataToGemm_v1
const
auto
wei_gemmk_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
wei_gemmk_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc
,
wei_gemmk_zdotslice_ydotslice_xdotslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
ZDotSlice
,
YDotSlice
,
XDotSlice
,
K
)),
make_tuple
(
make_pass_through_transform
(
C
)),
make_merge_transform
(
make_tuple
(
ZDotSlice
,
YDotSlice
,
XDotSlice
,
K_
)),
make_pass_through_transform
(
C_
)),
make_tuple
(
Sequence
<
1
,
2
,
3
,
0
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
1
,
2
,
3
,
0
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -716,66 +1024,20 @@ struct TransformConvBwdDataToGemm_v1
}
}
}
}
template
<
typename
CLayout
,
template
<
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
typename
CLayout_
=
CLayout
,
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNHWC
>
||
typename
std
::
enable_if
<
(
NDimSpatial
==
2
||
NDimSpatial
==
3
)
&&
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GNDHWC
>
||
(
is_same_v
<
CLayout_
,
tensor_layout
::
convolution
::
GNHWC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NHWGC
>
||
is_same_v
<
CLayout_
,
tensor_layout
::
convolution
::
GNDHWC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
NDHWGC
>
||
is_same_v
<
CLayout_
,
tensor_layout
::
convolution
::
NHWGC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
G_NHW_C
>
),
is_same_v
<
CLayout_
,
tensor_layout
::
convolution
::
NDHWGC
>
||
bool
>::
type
=
false
>
is_same_v
<
CLayout_
,
tensor_layout
::
convolution
::
G_NHW_C
>
),
static
auto
bool
>::
type
=
false
>
MakeCDescriptor_M_N
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_lengths
,
__host__
__device__
auto
MakeCDescriptor_M_N
()
const
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* out_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* wei_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
tildes
)
{
{
index_t
i_ztilde
=
tildes
[
ZIdx
-
NonSpatialDimsNum
];
index_t
i_ytilde
=
tildes
[
YIdx
-
NonSpatialDimsNum
];
index_t
i_xtilde
=
tildes
[
XIdx
-
NonSpatialDimsNum
];
const
index_t
N
=
in_g_n_c_wis_lengths
[
1
];
const
index_t
C
=
wei_g_k_c_xs_lengths
[
2
];
const
index_t
Di
=
NDimSpatial
==
3
?
in_g_n_c_wis_lengths
[
DIdx
]
:
1
;
const
index_t
Hi
=
in_g_n_c_wis_lengths
[
HIdx
];
const
index_t
Wi
=
in_g_n_c_wis_lengths
[
WIdx
];
const
index_t
Do
=
NDimSpatial
==
3
?
out_g_n_k_wos_lengths
[
DIdx
]
:
1
;
const
index_t
Ho
=
out_g_n_k_wos_lengths
[
HIdx
];
const
index_t
Wo
=
out_g_n_k_wos_lengths
[
WIdx
];
const
index_t
Z
=
NDimSpatial
==
3
?
wei_g_k_c_xs_lengths
[
ZIdx
]
:
1
;
const
index_t
Y
=
wei_g_k_c_xs_lengths
[
YIdx
];
const
index_t
X
=
wei_g_k_c_xs_lengths
[
XIdx
];
const
index_t
InLeftPadD
=
input_left_pads
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
InLeftPadH
=
input_left_pads
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
InLeftPadW
=
input_left_pads
[
WIdx
-
NonSpatialDimsNum
];
const
index_t
InRightPadD
=
input_right_pads
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
InRightPadH
=
input_right_pads
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
InRightPadW
=
input_right_pads
[
WIdx
-
NonSpatialDimsNum
];
const
index_t
ConvStrideD
=
conv_filter_strides
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
ConvStrideH
=
conv_filter_strides
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
WIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationD
=
conv_filter_dilations
[
DIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationH
=
conv_filter_dilations
[
HIdx
-
NonSpatialDimsNum
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
WIdx
-
NonSpatialDimsNum
];
// assume strided
// assume strided
// n_hi_wi_c for 2d n_di_hi_wi_c for 3d
// n_hi_wi_c for 2d n_di_hi_wi_c for 3d
const
auto
in_grid_desc
=
const
auto
in_grid_desc
=
MakeInGridDesc
();
make_in_grid_desc
<
NDimSpatial
,
CLayout
>
(
N
,
Di
,
Hi
,
Wi
,
C
,
in_g_n_c_wis_strides
);
if
constexpr
(
ConvBwdDataSpecialization
==
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
...
@@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -787,10 +1049,10 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_n_y_ho_x_wo_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_n_y_ho_x_wo_c_grid_desc
=
transform_tensor_descriptor
(
in_grid_desc
,
in_grid_desc
,
make_tuple
(
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_tuple
(
I1
,
Ho
),
make_tuple
(
I1
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
I1
,
Ho
_
),
make_tuple
(
I1
,
ConvStrideH
_
)),
make_embed_transform
(
make_tuple
(
I1
,
Wo
),
make_tuple
(
I1
,
ConvStrideW
)),
make_embed_transform
(
make_tuple
(
I1
,
Wo
_
),
make_tuple
(
I1
,
ConvStrideW
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
...
@@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -798,8 +1060,8 @@ struct TransformConvBwdDataToGemm_v1
in_n_y_ho_x_wo_c_grid_desc
,
in_n_y_ho_x_wo_c_grid_desc
,
make_tuple
(
make_freeze_transform
(
I0
),
make_tuple
(
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
)),
make_merge_transform
(
make_tuple
(
N
_
,
Ho
_
,
Wo
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<>
{},
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<>
{},
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -818,11 +1080,11 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_n_x_do_y_ho_x_wo_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_n_x_do_y_ho_x_wo_c_grid_desc
=
transform_tensor_descriptor
(
in_grid_desc
,
in_grid_desc
,
make_tuple
(
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_tuple
(
I1
,
Do
),
make_tuple
(
I1
,
ConvStrideD
)),
make_embed_transform
(
make_tuple
(
I1
,
Do
_
),
make_tuple
(
I1
,
ConvStrideD
_
)),
make_embed_transform
(
make_tuple
(
I1
,
Ho
),
make_tuple
(
I1
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
I1
,
Ho
_
),
make_tuple
(
I1
,
ConvStrideH
_
)),
make_embed_transform
(
make_tuple
(
I1
,
Wo
),
make_tuple
(
I1
,
ConvStrideW
)),
make_embed_transform
(
make_tuple
(
I1
,
Wo
_
),
make_tuple
(
I1
,
ConvStrideW
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
...
@@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -836,8 +1098,8 @@ struct TransformConvBwdDataToGemm_v1
make_tuple
(
make_freeze_transform
(
I0
),
make_tuple
(
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_freeze_transform
(
I0
),
make_merge_transform
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
)),
make_merge_transform
(
make_tuple
(
N
_
,
Do
_
,
Ho
_
,
Wo
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
1
>
{},
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{},
Sequence
<
5
>
{},
...
@@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -861,36 +1123,21 @@ struct TransformConvBwdDataToGemm_v1
}
}
else
else
{
{
const
auto
GcdStrideDilationD
=
math
::
gcd
(
ConvStrideD
,
ConvDilationD
);
const
auto
GcdStrideDilationH
=
math
::
gcd
(
ConvStrideH
,
ConvDilationH
);
const
auto
GcdStrideDilationW
=
math
::
gcd
(
ConvStrideW
,
ConvDilationW
);
const
auto
ZTilde
=
ConvStrideD
/
GcdStrideDilationD
;
const
auto
YTilde
=
ConvStrideH
/
GcdStrideDilationH
;
const
auto
XTilde
=
ConvStrideW
/
GcdStrideDilationW
;
const
auto
DTilde
=
Do
+
math
::
integer_divide_ceil
(
ConvDilationD
*
(
Z
-
I1
),
ConvStrideD
);
const
auto
HTilde
=
Ho
+
math
::
integer_divide_ceil
(
ConvDilationH
*
(
Y
-
I1
),
ConvStrideH
);
const
auto
WTilde
=
Wo
+
math
::
integer_divide_ceil
(
ConvDilationW
*
(
X
-
I1
),
ConvStrideW
);
// only work on DTilde, HTilde and WTilde that contribute to
// only work on DTilde, HTilde and WTilde that contribute to
// non-padding area of input tensor
// non-padding area of input tensor
const
auto
IDTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IDTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadD
-
ConvDilationD
*
(
ZTilde
-
I1
)),
ConvStrideD
);
math
::
max
(
I0
,
InLeftPadD
_
-
ConvDilationD
_
*
(
ZTilde
_
-
I1
)),
ConvStrideD
_
);
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadH
-
ConvDilationH
*
(
YTilde
-
I1
)),
ConvStrideH
);
math
::
max
(
I0
,
InLeftPadH
_
-
ConvDilationH
_
*
(
YTilde
_
-
I1
)),
ConvStrideH
_
);
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadW
-
ConvDilationW
*
(
XTilde
-
I1
)),
ConvStrideW
);
math
::
max
(
I0
,
InLeftPadW
_
-
ConvDilationW
_
*
(
XTilde
_
-
I1
)),
ConvStrideW
_
);
const
auto
IDTildeSliceEnd
=
math
::
min
(
const
auto
IDTildeSliceEnd
=
math
::
min
(
DTilde
,
math
::
integer_divide_ceil
(
InLeftPadD
+
Di
-
I1
,
ConvStrideD
)
+
I1
);
DTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadD
_
+
Di
_
-
I1
,
ConvStrideD
_
)
+
I1
);
const
auto
IHTildeSliceEnd
=
math
::
min
(
const
auto
IHTildeSliceEnd
=
math
::
min
(
HTilde
,
math
::
integer_divide_ceil
(
InLeftPadH
+
Hi
-
I1
,
ConvStrideH
)
+
I1
);
HTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadH
_
+
Hi
_
-
I1
,
ConvStrideH
_
)
+
I1
);
const
auto
IWTildeSliceEnd
=
math
::
min
(
const
auto
IWTildeSliceEnd
=
math
::
min
(
WTilde
,
math
::
integer_divide_ceil
(
InLeftPadW
+
Wi
-
I1
,
ConvStrideW
)
+
I1
);
WTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadW
_
+
Wi
_
-
I1
,
ConvStrideW
_
)
+
I1
);
const
auto
DTildeSlice
=
IDTildeSliceEnd
-
IDTildeSliceBegin
;
const
auto
DTildeSlice
=
IDTildeSliceEnd
-
IDTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
...
@@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -901,34 +1148,34 @@ struct TransformConvBwdDataToGemm_v1
{
{
const
auto
in_n_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_n_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
in_grid_desc
,
in_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Hi
_
,
InLeftPadH
_
,
InRightPadH
_
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pad_transform
(
Wi
_
,
InLeftPadW
_
,
InRightPadW
_
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
const
auto
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
in_n_hip_wip_c_grid_desc
,
in_n_hip_wip_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_tuple
(
YTilde
,
HTilde
),
make_embed_transform
(
make_tuple
(
YTilde
_
,
HTilde
_
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_tuple
(
ConvDilationH
_
,
ConvStrideH
_
)),
make_embed_transform
(
make_tuple
(
XTilde
,
WTilde
),
make_embed_transform
(
make_tuple
(
XTilde
_
,
WTilde
_
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_tuple
(
ConvDilationW
_
,
ConvStrideW
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
in_n_htildeslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_n_htildeslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_freeze_transform
(
i_yt
ilde
),
make_freeze_transform
(
IdxYT
ilde
_
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
HTilde
_
,
IHTildeSliceBegin
,
HTildeSlice
),
make_freeze_transform
(
i_xt
ilde
),
make_freeze_transform
(
IdxXT
ilde
_
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_slice_transform
(
WTilde
_
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -944,8 +1191,8 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
in_n_htildeslice_wtildeslice_c_grid_desc
,
in_n_htildeslice_wtildeslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
HTildeSlice
,
WTildeSlice
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
N
_
,
HTildeSlice
,
WTildeSlice
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
,
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
,
1
,
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -961,11 +1208,11 @@ struct TransformConvBwdDataToGemm_v1
{
{
const
auto
in_n_dip_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_n_dip_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
in_grid_desc
,
in_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_pad_transform
(
Di
,
InLeftPadD
,
InRightPadD
),
make_pad_transform
(
Di
_
,
InLeftPadD
_
,
InRightPadD
_
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Hi
_
,
InLeftPadH
_
,
InRightPadH
_
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pad_transform
(
Wi
_
,
InLeftPadW
_
,
InRightPadW
_
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
make_tuple
(
...
@@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -974,14 +1221,14 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
const
auto
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
in_n_dip_hip_wip_c_grid_desc
,
in_n_dip_hip_wip_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_embed_transform
(
make_tuple
(
ZTilde
,
DTilde
),
make_embed_transform
(
make_tuple
(
ZTilde
_
,
DTilde
_
),
make_tuple
(
ConvDilationD
,
ConvStrideD
)),
make_tuple
(
ConvDilationD
_
,
ConvStrideD
_
)),
make_embed_transform
(
make_tuple
(
YTilde
,
HTilde
),
make_embed_transform
(
make_tuple
(
YTilde
_
,
HTilde
_
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_tuple
(
ConvDilationH
_
,
ConvStrideH
_
)),
make_embed_transform
(
make_tuple
(
XTilde
,
WTilde
),
make_embed_transform
(
make_tuple
(
XTilde
_
,
WTilde
_
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_tuple
(
ConvDilationW
_
,
ConvStrideW
_
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -996,14 +1243,14 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
=
const
auto
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
transform_tensor_descriptor
(
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_tuple
(
make_pass_through_transform
(
N
_
),
make_freeze_transform
(
i_zt
ilde
),
make_freeze_transform
(
IdxZT
ilde
_
),
make_slice_transform
(
DTilde
,
IDTildeSliceBegin
,
DTildeSlice
),
make_slice_transform
(
DTilde
_
,
IDTildeSliceBegin
,
DTildeSlice
),
make_freeze_transform
(
i_yt
ilde
),
make_freeze_transform
(
IdxYT
ilde
_
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
HTilde
_
,
IHTildeSliceBegin
,
HTildeSlice
),
make_freeze_transform
(
i_xt
ilde
),
make_freeze_transform
(
IdxXT
ilde
_
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_slice_transform
(
WTilde
_
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
>
{},
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
2
>
{},
...
@@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -1024,8 +1271,8 @@ struct TransformConvBwdDataToGemm_v1
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
transform_tensor_descriptor
(
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
,
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
,
make_tuple
(
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
)),
make_merge_transform
(
make_tuple
(
N
_
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
)),
make_pass_through_transform
(
C
)),
make_pass_through_transform
(
C
_
)),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
@@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -1044,84 +1291,41 @@ struct TransformConvBwdDataToGemm_v1
}
}
// for input bias
// for input bias
template
<
typename
CLayout
,
template
<
typename
CLayout_
=
CLayout
,
typename
std
::
enable_if
<
NDimSpatial
==
2
&&
typename
std
::
enable_if
<
NDimSpatial
==
2
&&
(
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
GC
>
||
(
is_same_v
<
CLayout
_
,
tensor_layout
::
convolution
::
GC
>
||
is_same_v
<
CLayout
,
tensor_layout
::
convolution
::
G_C
>
),
is_same_v
<
CLayout
_
,
tensor_layout
::
convolution
::
G_C
>
),
bool
>::
type
=
false
>
bool
>::
type
=
false
>
static
auto
__host__
__device__
auto
MakeCDescriptor_M_N
()
const
MakeCDescriptor_M_N
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* out_g_n_k_wos_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* wei_g_k_c_xs_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
/* in_g_n_c_wis_strides */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
/* input_right_pads */
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
/* tildes */
)
{
{
const
index_t
N
=
in_g_n_c_wis_lengths
[
1
];
const
index_t
C
=
wei_g_k_c_xs_lengths
[
2
];
const
index_t
Hi
=
in_g_n_c_wis_lengths
[
3
];
const
index_t
Wi
=
in_g_n_c_wis_lengths
[
4
];
const
index_t
Ho
=
out_g_n_k_wos_lengths
[
3
];
const
index_t
Wo
=
out_g_n_k_wos_lengths
[
4
];
const
index_t
Y
=
wei_g_k_c_xs_lengths
[
3
];
const
index_t
X
=
wei_g_k_c_xs_lengths
[
4
];
const
index_t
InLeftPadH
=
input_left_pads
[
0
];
const
index_t
InLeftPadW
=
input_left_pads
[
1
];
const
index_t
ConvStrideH
=
conv_filter_strides
[
0
];
const
index_t
ConvStrideW
=
conv_filter_strides
[
1
];
const
index_t
ConvDilationH
=
conv_filter_dilations
[
0
];
const
index_t
ConvDilationW
=
conv_filter_dilations
[
1
];
if
constexpr
(
ConvBwdDataSpecialization
==
if
constexpr
(
ConvBwdDataSpecialization
==
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
)
Filter1x1Stride1Pad0
)
{
{
const
auto
in_gemmm_gemmn_grid_desc
=
const
auto
in_gemmm_gemmn_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
*
Ho
*
Wo
,
C
),
make_tuple
(
I0
,
I1
));
make_naive_tensor_descriptor
(
make_tuple
(
N
_
*
Ho
_
*
Wo
_
,
C
_
),
make_tuple
(
I0
,
I1
));
return
in_gemmm_gemmn_grid_desc
;
return
in_gemmm_gemmn_grid_desc
;
}
}
else
else
{
{
const
auto
GcdStrideDilationH
=
math
::
gcd
(
ConvStrideH
,
ConvDilationH
);
const
auto
GcdStrideDilationW
=
math
::
gcd
(
ConvStrideW
,
ConvDilationW
);
const
auto
YTilde
=
ConvStrideH
/
GcdStrideDilationH
;
const
auto
XTilde
=
ConvStrideW
/
GcdStrideDilationW
;
const
auto
HTilde
=
Ho
+
math
::
integer_divide_ceil
(
ConvDilationH
*
(
Y
-
I1
),
ConvStrideH
);
const
auto
WTilde
=
Wo
+
math
::
integer_divide_ceil
(
ConvDilationW
*
(
X
-
I1
),
ConvStrideW
);
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
// only work on HTilde and WTilde that contribute to non-padding area of input tensor
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadH
-
ConvDilationH
*
(
YTilde
-
I1
)),
ConvStrideH
);
math
::
max
(
I0
,
InLeftPadH
_
-
ConvDilationH
_
*
(
YTilde
_
-
I1
)),
ConvStrideH
_
);
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadW
-
ConvDilationW
*
(
XTilde
-
I1
)),
ConvStrideW
);
math
::
max
(
I0
,
InLeftPadW
_
-
ConvDilationW
_
*
(
XTilde
_
-
I1
)),
ConvStrideW
_
);
const
auto
IHTildeSliceEnd
=
math
::
min
(
const
auto
IHTildeSliceEnd
=
math
::
min
(
HTilde
,
math
::
integer_divide_ceil
(
InLeftPadH
+
Hi
-
I1
,
ConvStrideH
)
+
I1
);
HTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadH
_
+
Hi
_
-
I1
,
ConvStrideH
_
)
+
I1
);
const
auto
IWTildeSliceEnd
=
math
::
min
(
const
auto
IWTildeSliceEnd
=
math
::
min
(
WTilde
,
math
::
integer_divide_ceil
(
InLeftPadW
+
Wi
-
I1
,
ConvStrideW
)
+
I1
);
WTilde
_
,
math
::
integer_divide_ceil
(
InLeftPadW
_
+
Wi
_
-
I1
,
ConvStrideW
_
)
+
I1
);
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
WTildeSlice
=
IWTildeSliceEnd
-
IWTildeSliceBegin
;
const
auto
WTildeSlice
=
IWTildeSliceEnd
-
IWTildeSliceBegin
;
// bias tensor
// bias tensor
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
make_naive_tensor_descriptor
(
const
auto
in_gemmmraw_gemmnraw_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
*
HTildeSlice
*
WTildeSlice
,
C
),
make_tuple
(
I0
,
I1
));
make_tuple
(
N
_
*
HTildeSlice
*
WTildeSlice
,
C
_
),
make_tuple
(
I0
,
I1
));
const
auto
in_gemmm_gemmn_grid_desc
=
ck
::
tensor_operation
::
device
::
PadTensorDescriptor
(
const
auto
in_gemmm_gemmn_grid_desc
=
ck
::
tensor_operation
::
device
::
PadTensorDescriptor
(
in_gemmmraw_gemmnraw_grid_desc
,
in_gemmmraw_gemmnraw_grid_desc
,
...
@@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1
...
@@ -1131,6 +1335,25 @@ struct TransformConvBwdDataToGemm_v1
return
in_gemmm_gemmn_grid_desc
;
return
in_gemmm_gemmn_grid_desc
;
}
}
}
}
IndexType
N_
;
IndexType
Di_
,
Hi_
,
Wi_
;
IndexType
Do_
,
Ho_
,
Wo_
;
IndexType
Z_
,
Y_
,
X_
;
IndexType
K_
,
C_
;
IndexType
DiStride_
,
HiStride_
,
WiStride_
;
IndexType
DoStride_
,
HoStride_
,
WoStride_
;
IndexType
CStrideTensorB_
,
CStrideTensorC_
,
KStrideTensorA_
,
KStrideTensorB_
;
IndexType
NStrideTensorA_
,
NStrideTensorC_
;
IndexType
ConvStrideD_
,
ConvStrideH_
,
ConvStrideW_
;
IndexType
ConvDilationD_
,
ConvDilationH_
,
ConvDilationW_
;
IndexType
InLeftPadD_
,
InLeftPadH_
,
InLeftPadW_
;
IndexType
InRightPadD_
,
InRightPadH_
,
InRightPadW_
;
IndexType
IdxZTilde_
,
IdxYTilde_
,
IdxXTilde_
;
IndexType
GcdStrideDilationD_
,
GcdStrideDilationH_
,
GcdStrideDilationW_
;
IndexType
ZTilde_
,
YTilde_
,
XTilde_
;
IndexType
DTilde_
,
HTilde_
,
WTilde_
;
IndexType
ZDot_
,
YDot_
,
XDot_
;
};
};
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
include/ck/utility/math_v2.hpp
View file @
58d75b7a
...
@@ -611,7 +611,7 @@ inline __device__ int8_t neg<int8_t>(int8_t x)
...
@@ -611,7 +611,7 @@ inline __device__ int8_t neg<int8_t>(int8_t x)
template
<
>
template
<
>
inline
__device__
half_t
neg
<
half_t
>
(
half_t
x
)
inline
__device__
half_t
neg
<
half_t
>
(
half_t
x
)
{
{
return
__hneg
(
x
);
return
__hneg
(
static_cast
<
__half
>
(
x
)
);
};
};
template
<
typename
T
>
template
<
typename
T
>
...
...
include/ck_tile/README.md
View file @
58d75b7a
...
@@ -45,5 +45,8 @@ our implementation of different device operators.
...
@@ -45,5 +45,8 @@ our implementation of different device operators.
**[ops/epilogue]**
**[ops/epilogue]**
epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues.
epilogue part of our kernel. We may extend this epilogue part to let users to build their own cutomized epilogues.
**[ref]**
reference implementation of cpu or gpu. This folder is supposed to include a specific header on demand.
## examples
## examples
currently we put all ck_tile related example under
[
/example/ck_tile
](
/example/ck_tile/
)
folder. Please check each example's subfolder.
currently we put all ck_tile related example under
[
/example/ck_tile
](
/example/ck_tile/
)
folder. Please check each example's subfolder.
include/ck_tile/core.hpp
View file @
58d75b7a
...
@@ -54,6 +54,7 @@
...
@@ -54,6 +54,7 @@
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/tensor/tile_window_utils.hpp"
#include "ck_tile/core/tensor/tile_window_utils.hpp"
#include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/utility/amd_address_space.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
...
...
include/ck_tile/ops/flatmm.hpp
View file @
58d75b7a
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp
0 → 100644
View file @
58d75b7a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
namespace
ck_tile
{
// "S"tream update output along "N"
// A in smem, B load from global
// require 4 wave, occupancy=1c
struct
FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl
:
public
FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using
BDataType
=
bf16_t
;
using
ODataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
BRes
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
,
typename
ScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_o
)
{
static_assert
(
BCoords
::
size
()
==
8
);
// 8
static_assert
(
OCoords
::
size
()
==
8
);
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
// index_t loop_cnt = n / Block_N;
register
float
v_c0
asm
(
"v64"
);
register
float
v_c1
asm
(
"v65"
);
register
float
v_c2
asm
(
"v66"
);
register
float
v_c3
asm
(
"v67"
);
register
float
v_c4
asm
(
"v68"
);
register
float
v_c5
asm
(
"v69"
);
register
float
v_c6
asm
(
"v70"
);
register
float
v_c7
asm
(
"v71"
);
register
float
v_c8
asm
(
"v72"
);
register
float
v_c9
asm
(
"v73"
);
register
float
v_c10
asm
(
"v74"
);
register
float
v_c11
asm
(
"v75"
);
register
float
v_c12
asm
(
"v76"
);
register
float
v_c13
asm
(
"v77"
);
register
float
v_c14
asm
(
"v78"
);
register
float
v_c15
asm
(
"v79"
);
register
float
v_c16
asm
(
"v80"
);
register
float
v_c17
asm
(
"v81"
);
register
float
v_c18
asm
(
"v82"
);
register
float
v_c19
asm
(
"v83"
);
register
float
v_c20
asm
(
"v84"
);
register
float
v_c21
asm
(
"v85"
);
register
float
v_c22
asm
(
"v86"
);
register
float
v_c23
asm
(
"v87"
);
register
float
v_c24
asm
(
"v88"
);
register
float
v_c25
asm
(
"v89"
);
register
float
v_c26
asm
(
"v90"
);
register
float
v_c27
asm
(
"v91"
);
register
float
v_c28
asm
(
"v92"
);
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int
lane_id
=
threadIdx
.
x
%
64
;
int
sld_y_os
=
(
lane_id
%
16
)
*
4
+
(
lane_id
/
16
)
*
128
;
sld_y_os
*=
2
;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int
sfl_sst
=
(
threadIdx
.
x
%
16
*
4
)
+
(
threadIdx
.
x
/
16
)
*
(
64
+
4
);
sfl_sst
*=
2
;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
// [s_loop_cnt]"+s"(loop_cnt),
[
s_loop_cnt
]
"+s"
(
n
),
[
c0
]
"+v"
(
v_c0
),
[
c1
]
"+v"
(
v_c1
),
[
c2
]
"+v"
(
v_c2
),
[
c3
]
"+v"
(
v_c3
),
[
c4
]
"+v"
(
v_c4
),
[
c5
]
"+v"
(
v_c5
),
[
c6
]
"+v"
(
v_c6
),
[
c7
]
"+v"
(
v_c7
),
[
c8
]
"+v"
(
v_c8
),
[
c9
]
"+v"
(
v_c9
),
[
c10
]
"+v"
(
v_c10
),
[
c11
]
"+v"
(
v_c11
),
[
c12
]
"+v"
(
v_c12
),
[
c13
]
"+v"
(
v_c13
),
[
c14
]
"+v"
(
v_c14
),
[
c15
]
"+v"
(
v_c15
),
[
c16
]
"+v"
(
v_c16
),
[
c17
]
"+v"
(
v_c17
),
[
c18
]
"+v"
(
v_c18
),
[
c19
]
"+v"
(
v_c19
),
[
c20
]
"+v"
(
v_c20
),
[
c21
]
"+v"
(
v_c21
),
[
c22
]
"+v"
(
v_c22
),
[
c23
]
"+v"
(
v_c23
),
[
c24
]
"+v"
(
v_c24
),
[
c25
]
"+v"
(
v_c25
),
[
c26
]
"+v"
(
v_c26
),
[
c27
]
"+v"
(
v_c27
),
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sst
]
"v"
(
sfl_sst
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_o0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
0
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
1
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
2
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
3
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
4
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
5
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
6
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
7
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
[
v_nan_hi
]
"v"
(
nan_hi
),
[
s_execflag_0
]
"s"
(
o_flags
[
number
<
0
>
{}]),
[
s_execflag_1
]
"s"
(
o_flags
[
number
<
1
>
{}]),
[
s_execflag_2
]
"s"
(
o_flags
[
number
<
2
>
{}]),
[
s_execflag_3
]
"s"
(
o_flags
[
number
<
3
>
{}]),
[
s_execflag_4
]
"s"
(
o_flags
[
number
<
4
>
{}]),
[
s_execflag_5
]
"s"
(
o_flags
[
number
<
5
>
{}]),
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl
:
public
FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using
BDataType
=
bf16_t
;
using
ODataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
BRes
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
,
typename
ScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_o
)
{
static_assert
(
BCoords
::
size
()
==
8
);
// 8
static_assert
(
OCoords
::
size
()
==
8
);
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
// index_t loop_cnt = n / Block_N;
register
float
v_c0
asm
(
"v64"
);
register
float
v_c1
asm
(
"v65"
);
register
float
v_c2
asm
(
"v66"
);
register
float
v_c3
asm
(
"v67"
);
register
float
v_c4
asm
(
"v68"
);
register
float
v_c5
asm
(
"v69"
);
register
float
v_c6
asm
(
"v70"
);
register
float
v_c7
asm
(
"v71"
);
register
float
v_c8
asm
(
"v72"
);
register
float
v_c9
asm
(
"v73"
);
register
float
v_c10
asm
(
"v74"
);
register
float
v_c11
asm
(
"v75"
);
register
float
v_c12
asm
(
"v76"
);
register
float
v_c13
asm
(
"v77"
);
register
float
v_c14
asm
(
"v78"
);
register
float
v_c15
asm
(
"v79"
);
register
float
v_c16
asm
(
"v80"
);
register
float
v_c17
asm
(
"v81"
);
register
float
v_c18
asm
(
"v82"
);
register
float
v_c19
asm
(
"v83"
);
register
float
v_c20
asm
(
"v84"
);
register
float
v_c21
asm
(
"v85"
);
register
float
v_c22
asm
(
"v86"
);
register
float
v_c23
asm
(
"v87"
);
register
float
v_c24
asm
(
"v88"
);
register
float
v_c25
asm
(
"v89"
);
register
float
v_c26
asm
(
"v90"
);
register
float
v_c27
asm
(
"v91"
);
register
float
v_c28
asm
(
"v92"
);
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int
lane_id
=
threadIdx
.
x
%
64
;
int
sld_y_os
=
(
lane_id
%
16
)
*
4
+
(
lane_id
/
16
)
*
128
;
sld_y_os
*=
2
;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int
sfl_sst
=
(
threadIdx
.
x
%
16
*
4
)
+
(
threadIdx
.
x
/
16
)
*
(
64
+
4
);
sfl_sst
*=
2
;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
[
s_loop_cnt
]
"+s"
(
n
),
[
c0
]
"+v"
(
v_c0
),
[
c1
]
"+v"
(
v_c1
),
[
c2
]
"+v"
(
v_c2
),
[
c3
]
"+v"
(
v_c3
),
[
c4
]
"+v"
(
v_c4
),
[
c5
]
"+v"
(
v_c5
),
[
c6
]
"+v"
(
v_c6
),
[
c7
]
"+v"
(
v_c7
),
[
c8
]
"+v"
(
v_c8
),
[
c9
]
"+v"
(
v_c9
),
[
c10
]
"+v"
(
v_c10
),
[
c11
]
"+v"
(
v_c11
),
[
c12
]
"+v"
(
v_c12
),
[
c13
]
"+v"
(
v_c13
),
[
c14
]
"+v"
(
v_c14
),
[
c15
]
"+v"
(
v_c15
),
[
c16
]
"+v"
(
v_c16
),
[
c17
]
"+v"
(
v_c17
),
[
c18
]
"+v"
(
v_c18
),
[
c19
]
"+v"
(
v_c19
),
[
c20
]
"+v"
(
v_c20
),
[
c21
]
"+v"
(
v_c21
),
[
c22
]
"+v"
(
v_c22
),
[
c23
]
"+v"
(
v_c23
),
[
c24
]
"+v"
(
v_c24
),
[
c25
]
"+v"
(
v_c25
),
[
c26
]
"+v"
(
v_c26
),
[
c27
]
"+v"
(
v_c27
),
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sst
]
"v"
(
sfl_sst
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_o0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
0
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
1
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
2
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
3
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
4
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
5
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
6
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
7
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
[
v_nan_hi
]
"v"
(
nan_hi
),
[
s_execflag_0
]
"s"
(
o_flags
[
number
<
0
>
{}]),
[
s_execflag_1
]
"s"
(
o_flags
[
number
<
1
>
{}]),
[
s_execflag_2
]
"s"
(
o_flags
[
number
<
2
>
{}]),
[
s_execflag_3
]
"s"
(
o_flags
[
number
<
3
>
{}]),
[
s_execflag_4
]
"s"
(
o_flags
[
number
<
4
>
{}]),
[
s_execflag_5
]
"s"
(
o_flags
[
number
<
5
>
{}]),
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
}
// namespace ck_tile
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
View file @
58d75b7a
...
@@ -3,610 +3,815 @@
...
@@ -3,610 +3,815 @@
#endif
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#
define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#
define _UK_PK_CVT_(x0_, x1_, y_) \
#define _UK_PK_CVT_(x0_, x1_, y_)
\
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#
define _UK_PK_CVT_(x0_, x1_, y_)
\
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
#endif
";-------------------------------------------------------------
\n
"
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_waitcnt 0
\n
"
" s_waitcnt 0
\n
"
"L_start%=:
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], "
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], "
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] "
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
"
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] "
" s_waitcnt vmcnt(32)
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] "
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], "
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], "
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] "
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] "
" s_waitcnt vmcnt(32)
\n
"
"
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] "
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], "
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], "
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], "
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] "
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] "
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], "
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], "
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] "
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], "
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]]
\n
"
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], "
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
"%[c2], %[c3]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] "
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]]
\n
"
"
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] "
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], "
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
"%[c2], %[c3]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] "
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
"
\n
"
_UK_MFMA_
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] "
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], "
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
"%[c6], %[c7]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] "
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
"
\n
"
_UK_MFMA_
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] "
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], "
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
"%[c6], %[c7]]
\n
"
_UK_MFMA_
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] "
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
"
\n
"
_UK_MFMA_
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] "
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], "
" ;------------------------------
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] "
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]
\n
"
"
\n
"
_UK_MFMA_
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" s_waitcnt lgkmcnt(0)
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] "
" s_barrier
\n
"
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], "
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] "
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], "
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
"%[c14], %[c15]]
\n
"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], "
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], "
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
"%[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" s_waitcnt lgkmcnt(0)
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], "
" s_mov_b64 exec, %[s_execflag_0]
\n
"
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], "
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c0], s[8:9]
\n
"
"%[c13], %[c14], %[c15]]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c1], s[8:9]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], "
" s_mov_b64 exec, %[s_execflag_2]
\n
"
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], "
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c2], s[8:9]
\n
"
"%[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" s_mov_b64 exec, %[s_execflag_3]
\n
"
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], "
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c3], s[8:9]
\n
"
"%[c15]]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c4], s[8:9]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c5], s[8:9]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c6], s[8:9]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c7], s[8:9]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" s_add_u32 s12, s86, s12
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
" s_addc_u32 s9, 0, s9
\n
"
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
" s_waitcnt vmcnt(32)
\n
"
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
" s_barrier
\n
"
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
"%[c14]"
,
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
"%[c15]"
,
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"%[c7]"
)
" ;------------------------------
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] "
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
"
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] "
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
"
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
"
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] "
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
" s_mov_b64 exec, %[s_execflag_0] "
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c0], s[8:9]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_mov_b64 exec, %[s_execflag_1] "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c1], s[8:9]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_mov_b64 exec, %[s_execflag_2] "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c2], s[8:9]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" s_mov_b64 exec, %[s_execflag_3] "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c3], s[8:9]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_mov_b64 exec, %[s_execflag_4] "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c4], s[8:9]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_mov_b64 exec, %[s_execflag_5] "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c5], s[8:9]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" s_mov_b64 exec, %[s_execflag_6] "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c6], s[8:9]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_mov_b64 exec, %[s_execflag_7] "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c7], s[8:9]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_cbranch_scc0 L_end%=
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" s_add_u32 s12, s86, s12
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_addc_u32 s13, 0, s13
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_addc_u32 s9, 0, s9
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" s_barrier
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[130:131], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], "
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" s_waitcnt vmcnt(32)
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen "
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
"offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[146:147], "
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
"v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], "
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" s_waitcnt vmcnt(32)
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], "
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], "
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]]
\n
"
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], "
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[162:163], "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], "
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], "
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], "
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], "
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]]
\n
"
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], "
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], "
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], "
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen "
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
"offset:3072
\n
"
_UK_MFMA_
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], "
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], "
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], "
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], "
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen "
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[178:179], "
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
"v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], "
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[194:195], "
"v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[210:211], "
"v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[226:227], "
"v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[242:243], "
"v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c1
6
]"
)
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c1
7
]"
)
_UK_PK_CVT_
(
_UK_PK_CVT_
(
"%[c
18
]"
,
"%[c
19
]"
,
"%[c1
7
]"
)
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c
22
]"
,
"%[c
23
]"
,
"%[c1
9
]"
)
_UK_PK_CVT_
(
"%[c2
0
]"
,
"%[c2
1
]"
,
"%[c
18
]"
)
_UK_PK_CVT_
(
"%[c2
4
]"
,
"%[c2
5
]"
,
"%[c
20
]"
)
_UK_PK_CVT_
(
_UK_PK_CVT_
(
"%[c2
2
]"
,
"%[c2
3
]"
,
"%[c1
9
]"
)
"%[c2
6
]"
,
"%[c2
7
]"
,
"%[c
2
1]"
)
_UK_PK_CVT_
(
"%[c28]"
,
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c2
0
]"
)
"%[c2
9
]"
,
_UK_PK_CVT_
(
"%[c
26
]"
,
"%[c27]"
,
"%[c21]"
)
"%[c22]"
)
_UK_PK_CVT_
(
"%[c
30
]"
,
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c
22
]"
)
"%[c
31
]"
,
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
"%[c23]"
)
" ;------------------------------
\n
"
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c16], s[8:9]
\n
"
" %[v_os_o0], %[c16], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c17], s[8:9]
\n
"
" %[v_os_o1], %[c17], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c18], s[8:9]
\n
"
" %[v_os_o2], %[c18], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c19], s[8:9]
\n
"
" %[v_os_o3], %[c19], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c20], s[8:9]
\n
"
" %[v_os_o4], %[c20], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c21], s[8:9]
\n
"
" %[v_os_o5], %[c21], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c22], s[8:9]
\n
"
" %[v_os_o6], %[c22], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c23], s[8:9]
\n
"
" %[v_os_o7], %[c23], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_branch L_start%=
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
"L_end%=:
\n
"
#undef _UK_MFMA_
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_PK_CVT_
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
0 → 100644
View file @
58d75b7a
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_mov_b32 s59, 0
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mov_b32 v64, 0
\n
"
" v_mov_b32 v80, 0
\n
"
" v_mov_b32 v65, 0
\n
"
" v_mov_b32 v81, 0
\n
"
" v_mov_b32 v66, 0
\n
"
" v_mov_b32 v82, 0
\n
"
" v_mov_b32 v67, 0
\n
"
" v_mov_b32 v83, 0
\n
"
" v_mov_b32 v68, 0
\n
"
" v_mov_b32 v84, 0
\n
"
" v_mov_b32 v69, 0
\n
"
" v_mov_b32 v85, 0
\n
"
" v_mov_b32 v70, 0
\n
"
" v_mov_b32 v86, 0
\n
"
" v_mov_b32 v71, 0
\n
"
" v_mov_b32 v87, 0
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt vmcnt(24)
\n
"
"label_0AA6:
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, %[s_tile_os_o], s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%["
"c10]"
,
"%["
"c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64 v3, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 v3, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64 v3, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64 v3, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[236:237], "
"v[188:189], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[228:229], "
"v[244:245], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[236:237], "
"v[252:253], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[252:253], "
"v[188:189], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[244:245], "
"v[244:245], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[252:253], "
"v[252:253], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, s56, 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, 0x00000100, s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c17]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c22]"
,
"%[c23]"
,
"%[c19]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c20]"
)
_UK_PK_CVT_
(
"%[c26]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_branch label_0AA6
\n
"
" label_0EC1:
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39] "
"
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
"offset:25344
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
"offset:29696
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
"offset:27520
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
"offset:31872
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
View file @
58d75b7a
...
@@ -9,508 +9,509 @@
...
@@ -9,508 +9,509 @@
#endif
#endif
"s_mov_b32 s16, %[s_res_a0]
\n
"
"s_mov_b32 s16, %[s_res_a0]
\n
"
"s_mov_b32 s17, %[s_res_a1]
\n
"
"s_mov_b32 s17, %[s_res_a1]
\n
"
"s_mov_b32 s18, %[s_res_a2]
\n
"
"s_mov_b32 s18, %[s_res_a2]
\n
"
"s_mov_b32 s19, %[s_res_a3]
\n
"
"s_mov_b32 s19, %[s_res_a3]
\n
"
"s_mov_b32 s20, %[s_res_b0]
\n
"
"s_mov_b32 s20, %[s_res_b0]
\n
"
"s_mov_b32 s21, %[s_res_b1]
\n
"
"s_mov_b32 s21, %[s_res_b1]
\n
"
"s_mov_b32 s22, %[s_res_b2]
\n
"
"s_mov_b32 s22, %[s_res_b2]
\n
"
"s_mov_b32 s23, %[s_res_b3]
\n
"
"s_mov_b32 s23, %[s_res_b3]
\n
"
// "s_nop 4\n"
// "s_nop 4\n"
"; -- prefetch A0
\n
"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
"s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch A1
\n
"
"; -- prefetch A1
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch B0
\n
"
"; -- prefetch B0
\n
"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond
\n
"
"s_add_u32 s20, s86, s20 ; move b with cond
\n
"
"s_add_u32 s20, s86, s20 ; move b with cond
\n
"
"s_addc_u32 s21, 0, s21 ; move b with cond
\n
"
"s_addc_u32 s21, 0, s21 ; move b with cond
\n
"
"s_waitcnt vmcnt(40)
\n
"
"s_waitcnt vmcnt(40)
\n
"
"s_barrier
\n
"
"s_barrier
\n
"
"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64 K stride
"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64
"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
// K stride
"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"L_start%=:
\n
"
"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
"L_start%=:
\n
"
" s_barrier
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
_UK_MFMA_
" %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0]
\n
"
" s_barrier
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0]
\n
"
" %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen
\n
"
" %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0]
\n
"
" %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0]
\n
"
" %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0]
\n
"
" %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1]
\n
"
" %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1]
\n
"
" %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1]
\n
"
" %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1]
\n
"
" %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2]
\n
"
" %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen
\n
"
" %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2]
\n
"
" %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2]
\n
"
" %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2]
\n
"
" %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3]
\n
"
" %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3]
\n
"
" %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3]
\n
"
" %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3]
\n
"
" %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3]
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4]
\n
"
" %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen
\n
"
" %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4]
\n
"
" %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4]
\n
"
_UK_MFMA_
" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]
\n
"
" %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4]
\n
"
" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0] "
_UK_MFMA_
" %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4]
\n
"
" %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4]
\n
"
_UK_MFMA_
" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]
\n
"
" %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5]
\n
"
" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1] "
_UK_MFMA_
" %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5]
\n
"
" %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5]
\n
"
_UK_MFMA_
" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]
\n
"
" %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5]
\n
"
" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2] "
_UK_MFMA_
" %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5]
\n
"
" %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5]
\n
"
_UK_MFMA_
" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]
\n
"
" %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6]
\n
"
" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3] "
_UK_MFMA_
" %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen
\n
"
" %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6]
\n
"
" %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6]
\n
"
_UK_MFMA_
" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]
\n
"
" %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6]
\n
"
" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4] "
_UK_MFMA_
" %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6]
\n
"
" %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6]
\n
"
_UK_MFMA_
" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]
\n
"
" %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7]
\n
"
" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5] "
_UK_MFMA_
" %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7]
\n
"
" %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7]
\n
"
_UK_MFMA_
" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]
\n
"
" %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7]
\n
"
" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6] "
_UK_MFMA_
" %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7]
\n
"
" %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7]
\n
"
_UK_MFMA_
" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8]
\n
"
" %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen
\n
"
" %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8]
\n
"
" %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8]
\n
"
" %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8]
\n
"
" %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8]
\n
"
" %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9]
\n
"
" %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9]
\n
"
" %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9]
\n
"
" %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9]
\n
"
" %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9]
\n
"
" %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9]
\n
"
" %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10]
\n
"
" %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10]
\n
"
" %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen
\n
"
" %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10]
\n
"
" %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10]
\n
"
" %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10]
\n
"
" %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10]
\n
"
" %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11]
\n
"
" %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11]
\n
"
" %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11]
\n
"
" %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11]
\n
"
" %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11]
\n
"
" %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11]
\n
"
" %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11]
\n
"
_UK_MFMA_
" s_waitcnt vmcnt(32)
\n
"
" %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12]
\n
"
" %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen
\n
"
" %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12]
\n
"
" %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12]
\n
"
" %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12]
\n
"
" %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12]
\n
"
" %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13]
\n
"
" %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13]
\n
"
" %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13]
\n
"
" %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13]
\n
"
" %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13]
\n
"
" %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13]
\n
"
" %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14]
\n
"
" %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14]
\n
"
" %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen
\n
"
" %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14]
\n
"
" %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14]
\n
"
" %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14]
\n
"
" %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14]
\n
"
" %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15]
\n
"
" %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15]
\n
"
" %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15]
\n
"
" %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15]
\n
"
" %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15]
\n
"
" %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15]
\n
"
" %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15]
\n
"
_UK_MFMA_
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_add_u32 s20, s86, s20
\n
"
" ;------------------------------------------
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" ;------------------------------------------
\n
"
" s_barrier
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
_UK_MFMA_
" %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0]
\n
"
" s_barrier
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0]
\n
"
" %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
" %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0]
\n
"
" %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0]
\n
"
" %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0]
\n
"
" %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1]
\n
"
" %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1]
\n
"
" %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1]
\n
"
" %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1]
\n
"
" %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2]
\n
"
" %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
" %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2]
\n
"
" %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2]
\n
"
" %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2]
\n
"
" %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3]
\n
"
" %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3]
\n
"
" %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
_UK_MFMA_
" %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3]
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3]
\n
"
" %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3]
\n
"
" %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3]
\n
"
_UK_MFMA_
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3]
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4]
\n
"
" %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
" %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4]
\n
"
" %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4]
\n
"
_UK_MFMA_
" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
" %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4]
\n
"
" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4]
\n
"
" %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4]
\n
"
" %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4]
\n
"
_UK_MFMA_
" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
" %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5]
\n
"
" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5]
\n
"
" %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5]
\n
"
" %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5]
\n
"
_UK_MFMA_
" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
" %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5]
\n
"
" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2] "
_UK_MFMA_
" %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5]
\n
"
" %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5]
\n
"
_UK_MFMA_
" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
" %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6]
\n
"
" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3] "
_UK_MFMA_
" %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
" %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6]
\n
"
" %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6]
\n
"
_UK_MFMA_
" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
" %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6]
\n
"
" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4] "
_UK_MFMA_
" %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6]
\n
"
" %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6]
\n
"
_UK_MFMA_
" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
" %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7]
\n
"
" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5] "
_UK_MFMA_
" %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7]
\n
"
" %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7]
\n
"
_UK_MFMA_
" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
" %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7]
\n
"
" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6] "
_UK_MFMA_
" %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7]
\n
"
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7]
\n
"
" %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7]
\n
"
_UK_MFMA_
" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8]
\n
"
" %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
" %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8]
\n
"
" %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8]
\n
"
" %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8]
\n
"
" %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8]
\n
"
" %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9]
\n
"
" %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9]
\n
"
" %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9]
\n
"
" %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9]
\n
"
" %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9]
\n
"
" %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9]
\n
"
" %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10]
\n
"
" %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10]
\n
"
" %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
" %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10]
\n
"
" %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10]
\n
"
" %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10]
\n
"
" %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10]
\n
"
" %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11]
\n
"
" %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11]
\n
"
" %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11]
\n
"
" %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11]
\n
"
" %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11]
\n
"
" %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11]
\n
"
" %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11]
\n
"
_UK_MFMA_
" s_waitcnt vmcnt(32)
\n
"
" %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12]
\n
"
" %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
" %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12]
\n
"
" %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12]
\n
"
" %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12]
\n
"
" %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12]
\n
"
" %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13]
\n
"
" %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13]
\n
"
" %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13]
\n
"
" %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13]
\n
"
" %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13]
\n
"
" %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13]
\n
"
" %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14]
\n
"
" %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14]
\n
"
" %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
" %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14]
\n
"
" %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14]
\n
"
" %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14]
\n
"
" %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
" %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14]
\n
"
" %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15]
\n
"
" %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15]
\n
"
" %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
" %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15]
\n
"
" %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15]
\n
"
" %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15]
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15]
\n
"
" %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15]
\n
"
_UK_MFMA_
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
" %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
_UK_MFMA_
" %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15]
\n
"
" %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15]
\n
"
_UK_MFMA_
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15]
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_branch L_start%=
\n
"
" s_addc_u32 s21, 0, s21
\n
"
"L_end%=:
\n
"
" s_branch L_start%=
\n
"
" s_nop 2
\n
"
"L_end%=:
\n
"
" s_nop 2
\n
"
#undef _UK_MFMA_
#undef _UK_MFMA_
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
View file @
58d75b7a
...
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
...
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
return
pad_tensor_view
(
return
pad_tensor_view
(
q_dram_naive
,
q_dram_naive
,
make_tuple
(
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kSubQKHeaddim
>
{}),
make_tuple
(
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kSubQKHeaddim
>
{}),
sequence
<
false
,
kPadHeadDimQ
>
{});
sequence
<
kPadSeqLenQ
,
kPadHeadDimQ
>
{});
}
}
else
else
{
{
return
pad_tensor_view
(
return
pad_tensor_view
(
q_dram_naive
,
q_dram_naive
,
make_tuple
(
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kK0
>
{}),
make_tuple
(
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kK0
>
{}),
sequence
<
false
,
kPadHeadDimQ
>
{});
sequence
<
kPadSeqLenQ
,
kPadHeadDimQ
>
{});
}
}
}();
}();
const
auto
k_dram
=
[
&
]()
{
const
auto
k_dram
=
[
&
]()
{
...
@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
...
@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
return
pad_tensor_view
(
return
pad_tensor_view
(
k_dram_naive
,
k_dram_naive
,
make_tuple
(
number
<
FmhaPipeline
::
kN0
>
{},
number
<
FmhaPipeline
::
kK0
>
{}),
make_tuple
(
number
<
FmhaPipeline
::
kN0
>
{},
number
<
FmhaPipeline
::
kK0
>
{}),
sequence
<
false
,
kPadHeadDimQ
>
{});
sequence
<
kPadSeqLenK
,
kPadHeadDimQ
>
{});
}();
}();
const
auto
v_dram
=
[
&
]()
{
const
auto
v_dram
=
[
&
]()
{
if
constexpr
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
if
constexpr
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
...
@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
...
@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
return
pad_tensor_view
(
return
pad_tensor_view
(
v_dram_transposed
,
v_dram_transposed
,
make_tuple
(
number
<
FmhaPipeline
::
kN1
>
{},
number
<
FmhaPipeline
::
kK1
>
{}),
make_tuple
(
number
<
FmhaPipeline
::
kN1
>
{},
number
<
FmhaPipeline
::
kK1
>
{}),
sequence
<
kPadHeadDimV
,
false
>
{});
sequence
<
kPadHeadDimV
,
kPadSeqLenK
>
{});
}
}
else
else
{
{
...
@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
...
@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
return
pad_tensor_view
(
return
pad_tensor_view
(
v_dram_naive
,
v_dram_naive
,
make_tuple
(
number
<
FmhaPipeline
::
kN1
>
{},
number
<
FmhaPipeline
::
kK1
>
{}),
make_tuple
(
number
<
FmhaPipeline
::
kN1
>
{},
number
<
FmhaPipeline
::
kK1
>
{}),
sequence
<
false
,
kPadSeqLenK
>
{});
sequence
<
kPadHeadDimV
,
kPadSeqLenK
>
{});
}
}
}();
}();
...
@@ -1097,8 +1097,9 @@ struct FmhaFwdKernel
...
@@ -1097,8 +1097,9 @@ struct FmhaFwdKernel
number
<
FmhaPipeline
::
kAlignmentBias
>
{},
number
<
FmhaPipeline
::
kAlignmentBias
>
{},
number
<
1
>
{});
number
<
1
>
{});
return
pad_tensor_view
(
return
pad_tensor_view
(
bias_dram_naive
,
bias_dram_naive
,
bias_dram_window_lengths
,
sequence
<
false
,
kPadSeqLenK
>
{});
bias_dram_window_lengths
,
sequence
<
kPadSeqLenQ
,
kPadSeqLenK
>
{});
}();
}();
return
make_tile_window
(
bias_dram
,
bias_dram_window_lengths
,
{
i_m0
,
0
});
return
make_tile_window
(
bias_dram
,
bias_dram_window_lengths
,
{
i_m0
,
0
});
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
View file @
58d75b7a
...
@@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy
...
@@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetUK_1
()
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetUK_1
()
{
{
using
S_
=
typename
Problem
::
BlockShape
;
using
S_
=
typename
Problem
::
BlockShape
;
using
T_
=
typename
Problem
::
Traits
;
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
)
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
&&
T_
::
PipeInterleave
==
false
)
{
{
return
FlatmmSn_32x128x512_1x4x1_16x16x32_BF16
{};
return
FlatmmSn_32x128x512_1x4x1_16x16x32_BF16
{};
// return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
}
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
fp16_t
>
&&
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
)
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
&&
T_
::
PipeInterleave
==
false
)
{
{
return
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16
{};
return
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16
{};
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
&&
T_
::
PipeInterleave
==
true
)
{
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
return
FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
YDataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
DDataType
,
ck_tile
::
fp16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
TopkWeightDataType
,
float
>
&&
S_
::
Block_M1
==
32
&&
S_
::
Block_N1
==
128
&&
S_
::
Block_K1
==
512
&&
S_
::
Warp_M0
==
16
&&
S_
::
Warp_N0
==
16
&&
S_
::
Warp_K0
==
32
&&
T_
::
PipeInterleave
==
true
)
{
// return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
return
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl
{};
}
}
}
}
};
};
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
View file @
58d75b7a
...
@@ -22,7 +22,8 @@ template <bool IsGateOnly_,
...
@@ -22,7 +22,8 @@ template <bool IsGateOnly_,
FusedMoeGemmWeightPermuteEnum
PermuteEnum_
=
FusedMoeGemmWeightPermuteEnum
PermuteEnum_
=
FusedMoeGemmWeightPermuteEnum
::
b_nr_kr_waveflatten
,
FusedMoeGemmWeightPermuteEnum
::
b_nr_kr_waveflatten
,
bool
PadHiddenSize_
=
false
,
bool
PadHiddenSize_
=
false
,
bool
PadIntermediateSize_
=
false
>
bool
PadIntermediateSize_
=
false
,
bool
PipeInterleave_
=
true
>
struct
FusedMoeGemmTraits
struct
FusedMoeGemmTraits
{
{
// Gate+Up or Gate only
// Gate+Up or Gate only
...
@@ -32,6 +33,7 @@ struct FusedMoeGemmTraits
...
@@ -32,6 +33,7 @@ struct FusedMoeGemmTraits
static
constexpr
FusedMoeGemmWeightPermuteEnum
PermuteEnum
=
PermuteEnum_
;
static
constexpr
FusedMoeGemmWeightPermuteEnum
PermuteEnum
=
PermuteEnum_
;
static
constexpr
bool
PadHiddenSize
=
PadHiddenSize_
;
static
constexpr
bool
PadHiddenSize
=
PadHiddenSize_
;
static
constexpr
bool
PadIntermediateSize
=
PadIntermediateSize_
;
static
constexpr
bool
PadIntermediateSize
=
PadIntermediateSize_
;
static
constexpr
bool
PipeInterleave
=
PipeInterleave_
;
};
};
// Note: this need to be a bit mask
// Note: this need to be a bit mask
...
...
include/ck_tile/ops/gemm.hpp
View file @
58d75b7a
...
@@ -23,10 +23,10 @@
...
@@ -23,10 +23,10 @@
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
...
...
include/ck_tile/ops/smoothquant/kernel/smoothquant_kernel.hpp
View file @
58d75b7a
...
@@ -19,7 +19,8 @@ struct SmoothquantHostArgs
...
@@ -19,7 +19,8 @@ struct SmoothquantHostArgs
index_t
m
;
index_t
m
;
index_t
n
;
index_t
n
;
index_t
stride
;
// row_stride
index_t
x_stride
;
// input row_stride
index_t
y_stride
;
// output row_stride
};
};
// TODO: Extract some type to wrapper class
// TODO: Extract some type to wrapper class
...
@@ -58,14 +59,21 @@ struct Smoothquant
...
@@ -58,14 +59,21 @@ struct Smoothquant
index_t
m
;
index_t
m
;
index_t
n
;
index_t
n
;
index_t
stride
;
// row_stride
index_t
x_stride
;
// input row_stride
index_t
y_stride
;
// out row_stride
};
};
using
Hargs
=
SmoothquantHostArgs
;
using
Hargs
=
SmoothquantHostArgs
;
CK_TILE_HOST
static
constexpr
Kargs
MakeKargs
(
const
Hargs
&
hargs
)
CK_TILE_HOST
static
constexpr
Kargs
MakeKargs
(
const
Hargs
&
hargs
)
{
{
return
Kargs
{
return
Kargs
{
hargs
.
p_x
,
hargs
.
p_x
,
hargs
.
p_xscale
,
hargs
.
p_yscale
,
hargs
.
p_qy
,
hargs
.
m
,
hargs
.
n
,
hargs
.
stride
};
hargs
.
p_xscale
,
hargs
.
p_yscale
,
hargs
.
p_qy
,
hargs
.
m
,
hargs
.
n
,
hargs
.
x_stride
,
hargs
.
y_stride
};
}
}
CK_TILE_HOST
static
constexpr
auto
GridSize
(
const
Hargs
&
hargs
)
CK_TILE_HOST
static
constexpr
auto
GridSize
(
const
Hargs
&
hargs
)
...
@@ -116,7 +124,7 @@ struct Smoothquant
...
@@ -116,7 +124,7 @@ struct Smoothquant
const
auto
tmp_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
const
auto
tmp_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
const
XDataType
*>
(
kargs
.
p_x
),
static_cast
<
const
XDataType
*>
(
kargs
.
p_x
),
make_tuple
(
kargs
.
m
,
kargs
.
n
),
make_tuple
(
kargs
.
m
,
kargs
.
n
),
make_tuple
(
kargs
.
stride
,
1
),
make_tuple
(
kargs
.
x_
stride
,
1
),
number
<
Vector_N
>
{},
number
<
Vector_N
>
{},
number
<
1
>
{});
number
<
1
>
{});
...
@@ -157,7 +165,7 @@ struct Smoothquant
...
@@ -157,7 +165,7 @@ struct Smoothquant
auto
tmp_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
auto
tmp_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
QYDataType
*>
(
kargs
.
p_qy
),
static_cast
<
QYDataType
*>
(
kargs
.
p_qy
),
make_tuple
(
kargs
.
m
,
kargs
.
n
),
make_tuple
(
kargs
.
m
,
kargs
.
n
),
make_tuple
(
kargs
.
stride
,
1
),
make_tuple
(
kargs
.
y_
stride
,
1
),
number
<
Vector_N
>
{},
number
<
Vector_N
>
{},
number
<
1
>
{});
number
<
1
>
{});
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment