gaoqiong / composable_kernel_ROCM · Commits

Commit 40d038e9
authored Oct 14, 2024 by Jing Zhang
parent c3d05c0c

    clean

Showing 7 changed files with 30 additions and 43 deletions (+30 -43)
example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp                                            +1  -1
example/01_gemm/run_gemm_example_v2.inc                                             +8  -1
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp            +1  -1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp              +20 -23
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp         +0  -4
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp    +0  -3
include/ck/utility/amd_xdlops.hpp                                                   +0  -10
example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp

@@ -27,7 +27,7 @@ using DeviceGemmV2Instance =
         ALayout,    BLayout,    CLayout,
         ADataType,  BDataType,  CDataType,  AccDataType, CShuffleDataType,
         AElementOp, BElementOp, CElementOp, GemmDefault,
-#if 0
+#if 1
         64, 16, 16,
         256, 8, 16,
example/01_gemm/run_gemm_example_v2.inc

@@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         //                        get_rtol<CDataType>(),
         //                        get_atol<CDataType>());

-        LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
+        //for(int i = 0; i < M; i++)
+        //{
+        //    for(int j = 0; j < N; j++)
+        //    {
+        //        std::cout << ck::type_convert<float>(c_m_n_device_result(i, j)) << ",";
+        //    }
+        //    std::cout << std::endl;
+        //}
 #endif
     }
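The LogRangeAsType<float>(...) call in this hunk streams every element of c_m_n_device_result.mData to std::cout converted to float and separated by the given delimiter. A minimal standalone sketch of that behavior, assuming that semantics and using a hypothetical log_range_as helper plus a small std::vector in place of the CK utility and the device result tensor:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for CK's LogRangeAsType: stream each element of a range,
// converted to T, separated by delim.
template <typename T, typename Range>
std::ostream& log_range_as(std::ostream& os, const Range& range, const char* delim)
{
    const char* sep = "";
    for(const auto& v : range)
    {
        os << sep << static_cast<T>(v);
        sep = delim;
    }
    return os;
}

int main()
{
    std::vector<std::int16_t> c_m_n_device_result{1, 2, 3}; // illustrative values
    log_range_as<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result, ",")
        << std::endl;
    // prints: c_m_n_device_buf : 1,2,3
    return 0;
}
```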
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -25,7 +25,7 @@ struct PassThroughPack2
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
     {
-#if 1
+#if 0
         uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
         uint8_t x_l  = (x_u8 & 0x0f) >> 0;
         uint8_t x_h  = (x_u8 & 0xf0) >> 4;
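The branch toggled here unpacks a pk_i4_t, i.e. two 4-bit values packed into one byte, by masking the low and high nibbles. A minimal host-side sketch of that nibble split, with plain uint8_t and float standing in for ck::pk_i4_t and ck::half2_t:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // One byte holding two unsigned 4-bit values (illustrative content).
    std::uint8_t packed = 0x73;
    std::uint8_t lo     = (packed & 0x0f) >> 0; // low nibble  -> 3
    std::uint8_t hi     = (packed & 0xf0) >> 4; // high nibble -> 7

    // Convert each nibble to a floating-point lane, as the element-wise op
    // does for the two halves of ck::half2_t.
    float y0 = static_cast<float>(lo);
    float y1 = static_cast<float>(hi);
    std::printf("y0=%.1f y1=%.1f\n", y0, y1); // prints y0=3.0 y1=7.0
    return 0;
}
```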
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp

@@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;

+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
     {
         return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
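APackedSize and BPackedSize are computed with an immediately-invoked constexpr lambda: the constant evaluates to 2 when the element type is the packed-int4 type pk_i4_t (two values per byte) and to 1 otherwise. A self-contained sketch of the same pattern, with a hypothetical PackedInt4 struct standing in for pk_i4_t and int standing in for index_t:

```cpp
#include <type_traits>

struct PackedInt4 { unsigned char data; }; // hypothetical stand-in for ck::pk_i4_t

template <typename DataType>
struct TileTraits
{
    // Immediately-invoked constexpr lambda: picks the packed size at compile time.
    static constexpr int PackedSize = []() {
        if constexpr(std::is_same_v<std::remove_cv_t<DataType>, PackedInt4>)
            return 2; // two elements share one byte of storage
        else
            return 1; // one element per storage unit
    }();
};

static_assert(TileTraits<PackedInt4>::PackedSize == 2);
static_assert(TileTraits<float>::PackedSize == 1);

int main() { return 0; }
```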
@@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // in some cases.
         else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
-            constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr auto MLdsLayer  = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
@@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
             // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(BDataType);
-            ;
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType);
+            constexpr auto NLdsLayer  = LdsSize < 1 ? 1 : LdsSize;
             constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
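In both the A and B branches the LdsLayer value is 32 * 4 bytes divided by KPerBlock and by the element size; with integer division this truncates to 0 for large K tiles, hence the clamp to at least 1. A small worked example of that arithmetic (the KPerBlock values and 2-byte element size below are illustrative):

```cpp
#include <cstdio>

// Mirrors the expression above: 32 * 4 / KPerBlock / sizeof(DataType), clamped to >= 1.
constexpr int lds_layer(int k_per_block, int elem_bytes)
{
    const int lds_size = 32 * 4 / k_per_block / elem_bytes;
    return lds_size < 1 ? 1 : lds_size;
}

static_assert(lds_layer(32, 2) == 2);  // 128 / 32  / 2 = 2
static_assert(lds_layer(64, 2) == 1);  // 128 / 64  / 2 = 1
static_assert(lds_layer(256, 2) == 1); // 128 / 256 / 2 = 0 -> clamped to 1

int main()
{
    std::printf("KPerBlock=256, 2-byte elements: LdsLayer=%d\n", lds_layer(256, 2));
    return 0;
}
```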
@@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                                        NXdlPerWave,
                                                        KPack>())>;

-    static constexpr index_t APackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
-    static constexpr index_t BPackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
     __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
@@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared),
             a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);

         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            bit_cast<BDataType*>(bit_cast<unsigned char*>(p_shared) +
-                                 a_block_space_size_aligned * sizeof(ADataType)),
+            reinterpret_cast<BDataType*>(static_cast<ADataType*>(p_shared) +
+                                         a_block_space_size_aligned),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
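The two forms of the B-block base address in this hunk compute the same location: advancing a typed ADataType* by a_block_space_size_aligned elements reaches the same byte as advancing an unsigned char* by a_block_space_size_aligned * sizeof(ADataType). A host-side sketch of that equivalence, with ordinary heap memory standing in for the LDS pointer p_shared and reinterpret_cast standing in for ck::bit_cast:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

int main()
{
    using AData = std::uint16_t; // stand-in for a 2-byte ADataType (e.g. fp16)
    using BData = std::uint16_t; // stand-in for BDataType

    const std::size_t a_space_elems = 1024; // a_block_space_size_aligned, in elements
    void* p_shared = std::malloc(8192);     // stand-in for the shared-memory block

    // Typed-pointer form: offset by elements, then view as BData*.
    BData* b_typed = reinterpret_cast<BData*>(static_cast<AData*>(p_shared) + a_space_elems);

    // Byte-pointer form: offset by elements * sizeof(element), then view as BData*.
    BData* b_bytes = reinterpret_cast<BData*>(
        reinterpret_cast<unsigned char*>(p_shared) + a_space_elems * sizeof(AData));

    assert(b_typed == b_bytes); // both name the same address
    std::free(p_shared);
    return 0;
}
```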
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp

@@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4
                     dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);

                 dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
-                if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-                    printf("v4: %f %d\n",
-                           type_convert<float>(dst_buf[Number<dst_offset>{}]),
-                           threadIdx.x);
             });
         }
     });
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

@@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);

             dst_vector_container.template AsType<DstData>()(i) = dst_v;
-
-            //if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-            //printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
         });

         // copy data from dst_vector_container to dst_buf
include/ck/utility/amd_xdlops.hpp

@@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
     template <class FloatC>
     __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
     {
-        auto tmp_a = vector_type<half_t, 4>{reg_a};
-        auto tmp_b = vector_type<half_t, 4>{reg_b};
-
-        printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n",
-               static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
-               static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
-               static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
-               static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
-               threadIdx.x,
-               blockIdx.x);
         reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
             reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
     }
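For reference, the __builtin_amdgcn_mfma_f32_16x16x16f16 intrinsic that Run() wraps performs a 16x16x16 matrix multiply-accumulate per wavefront, with fp16 operands and fp32 accumulation. A plain host-side loop sketching the same math on illustrative data (a reference computation, not the GPU intrinsic):

```cpp
#include <array>
#include <cstdio>

int main()
{
    constexpr int M = 16, N = 16, K = 16;
    std::array<float, M * K> a{}; // stands in for the fp16 A fragment
    std::array<float, K * N> b{}; // stands in for the fp16 B fragment
    std::array<float, M * N> c{}; // fp32 accumulator fragment

    for(int i = 0; i < M * K; ++i) a[i] = 1.0f;
    for(int i = 0; i < K * N; ++i) b[i] = 2.0f;

    // C += A * B over the 16x16x16 tile.
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                c[m * N + n] += a[m * K + k] * b[k * N + n];

    std::printf("c[0][0] = %f\n", c[0]); // 16 * (1 * 2) = 32
    return 0;
}
```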