gaoqiong / composable_kernel_ROCM · Commits · 3ef4d2c2

Commit 3ef4d2c2, authored Oct 13, 2024 by Jing Zhang
    clean
Parent: 0f3b88bf

Showing 9 changed files with 64 additions and 41 deletions (+64 / -41)
CMakeLists.txt                                                                      +1  -1
cmake/EnableCompilerWarnings.cmake                                                  +1  -1
example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp                                            +1  -1
example/01_gemm/run_gemm_example_v2.inc                                             +2  -2
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp            +5  -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp              +30 -23
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp    +18 -10
include/ck/utility/amd_xdlops.hpp                                                   +5  -2
script/cmake-ck-dev.sh                                                              +1  -1
CMakeLists.txt

@@ -489,7 +489,7 @@ include_directories(BEFORE
 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
-    add_compile_options(-Werror)
+    # add_compile_options(-Werror)
     add_compile_options(-Weverything)
 endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
cmake/EnableCompilerWarnings.cmake

@@ -66,7 +66,7 @@ else()
         -Wunreachable-code
         -Wunused
         -Wno-reserved-identifier
-        -Werror
+        # -Werror
         -Wno-option-ignored
         -Wsign-compare
         -Wno-extra-semi-stmt
example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp

@@ -8,7 +8,7 @@
 using ADataType        = ck::half_t;
 using BDataType        = ck::f8_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
+using CShuffleDataType = float;
 using CDataType        = ck::half_t;
 using ALayout          = Row;
example/01_gemm/run_gemm_example_v2.inc

@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     };

     StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
-    StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
+    StrideB = f_get_default_stride(K, N, StrideB / 2, BLayout{});
     StrideC = f_get_default_stride(M, N, StrideC, CLayout{});

     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));

@@ -267,7 +267,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     std::size_t flop = 2_uz * M * N * K;
     std::size_t num_btype =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N / 2 + sizeof(CDataType) * M * N;

     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
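A note on the arithmetic in the second hunk: the example counts 2 * M * N * K floating-point operations, and after this change only K * N / 2 bytes of B traffic, consistent with a B matrix stored two elements per byte (as a packed-int4 layout would be). A self-contained sketch of the same bookkeeping follows; the problem sizes, the 2-byte A/C elements, and the millisecond timing are assumptions for illustration, not values taken from this commit.

    #include <cstddef>
    #include <cstdio>

    // Illustrative host-side version of the flop/byte accounting above.
    int main()
    {
        const std::size_t M = 3840, N = 4096, K = 4096;
        const double ave_time_ms = 1.0; // assumed measured kernel time in ms

        const std::size_t flop      = 2 * M * N * K;
        const std::size_t num_btype = 2 * M * K   // fp16 A: 2 bytes per element
                                    + K * N / 2   // packed B: 2 elements per byte
                                    + 2 * M * N;  // fp16 C: 2 bytes per element

        const double tflops = static_cast<double>(flop) / 1.0e9 / ave_time_ms;      // GFLOP / ms = TFLOP/s
        const double gb_per_s = static_cast<double>(num_btype) / 1.0e6 / ave_time_ms; // MB / ms = GB/s
        std::printf("%.3f TFLOPS, %.3f GB/s\n", tflops, gb_per_s);
        return 0;
    }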
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -25,6 +25,7 @@ struct PassThroughPack2
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
     {
+#if 0
         uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
         uint8_t x_l  = (x_u8 & 0x0f) >> 0;
         uint8_t x_h  = (x_u8 & 0xf0) >> 4;

@@ -33,6 +34,10 @@ struct PassThroughPack2
         auto h_f16 = ck::type_convert<ck::half_t>(x_h);
         y          = {l_f16, h_f16};
+#else
+        uint32_t t = ck::bit_cast<uint8_t>(x);
+        y          = ck::bit_cast<half2_t>(t);
+#endif
     }

     constexpr const static bool is_pack2_invocable = true;
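For context, the branch that is now disabled above unpacks the two 4-bit values carried in one pk_i4_t byte and converts each of them. A self-contained host sketch of that nibble unpacking is below; plain C++ stands in for ck::half_t and ck::type_convert, and it ignores the sign handling a real signed-int4 type would need.

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: unpack two unsigned 4-bit values from one byte and
    // convert each to float, mirroring the #if 0 branch of PassThroughPack2.
    int main()
    {
        uint8_t packed = 0x7A;                 // high nibble 0x7, low nibble 0xA
        uint8_t lo     = (packed & 0x0f) >> 0; // first packed element
        uint8_t hi     = (packed & 0xf0) >> 4; // second packed element
        float   lo_f   = static_cast<float>(lo);
        float   hi_f   = static_cast<float>(hi);
        std::printf("lo=%g hi=%g\n", lo_f, hi_f);
        return 0;
    }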
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp

@@ -924,6 +924,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                                      NXdlPerWave,
                                                      KPack>())>;

+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment

@@ -937,7 +944,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

         constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align) / BPackedSize;

         // LDS allocation for C shuffle in LDS
         constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
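BPackedSize above is computed with an immediately invoked constexpr lambda that selects a per-type pack factor at compile time. A minimal standalone sketch of the same pattern is shown below; fake_pk_i4_t and PackInfo are placeholders invented for the example, since ck::pk_i4_t and index_t are not pulled in here.

    #include <type_traits>

    // Placeholder standing in for a packed two-elements-per-byte type.
    struct fake_pk_i4_t {};

    template <typename BDataType>
    struct PackInfo
    {
        // Immediately invoked constexpr lambda, same pattern as BPackedSize above:
        // packed types count two elements per byte, everything else counts one.
        static constexpr int BPackedSize = []() {
            if constexpr(std::is_same_v<std::remove_cv_t<BDataType>, fake_pk_i4_t>)
                return 2;
            else
                return 1;
        }();
    };

    static_assert(PackInfo<fake_pk_i4_t>::BPackedSize == 2, "packed type counts two per byte");
    static_assert(PackInfo<float>::BPackedSize == 1, "ordinary types are unpacked");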
@@ -1312,9 +1319,9 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<unsigned char*>(p_shared) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);

@@ -1329,19 +1336,19 @@ struct GridwiseGemm_xdl_cshuffle_v3
             KPerBlock);

         blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
-            a_block_desc_ak0_m_ak1,
-            a_blockwise_copy,
-            a_grid_buf,
-            a_block_buf,
-            a_block_slice_copy_step,
-            b_grid_desc_bk0_n_bk1,
-            b_block_desc_bk0_n_bk1,
-            b_blockwise_copy,
-            b_grid_buf,
-            b_block_buf,
-            b_block_slice_copy_step,
-            c_thread_buf,
-            num_k_block_main_loop);
+            a_block_desc_ak0_m_ak1,
+            a_blockwise_copy,
+            a_grid_buf,
+            a_block_buf,
+            a_block_slice_copy_step,
+            b_grid_desc_bk0_n_bk1,
+            b_block_desc_bk0_n_bk1,
+            b_blockwise_copy,
+            b_grid_buf,
+            b_block_buf,
+            b_block_slice_copy_step,
+            c_thread_buf,
+            num_k_block_main_loop);

         // shuffle C and write out
         {

@@ -1706,17 +1713,17 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_0) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<char*>(p_shared_0) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<ADataType*>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<BDataType*>(p_shared_1) +
-                a_block_space_size_aligned * sizeof(ADataType) / sizeof(BDataType),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+            static_cast<BDataType*>(static_cast<char*>(p_shared_1) +
+                                    a_block_space_size_aligned * sizeof(ADataType)),
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);

         auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
         auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
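On the pointer arithmetic introduced above: the B tile's base address is now computed by offsetting the shared pointer in bytes (through a char* cast) before reinterpreting it, so the B region starts right after the A tile without dividing by sizeof(BDataType), which matters once B elements are sub-byte packed. A small host-side sketch of that carving is below; the float/short element types and the helper name are placeholders, not CK types.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Illustrative only: split one raw workspace into an A region and a B region
    // by offsetting in bytes, the way b_block_buf is placed after the A tile above.
    void carve_workspace(void* workspace,
                         std::size_t a_elems, // number of A elements (placeholder type: float)
                         float** a_out,
                         short** b_out)       // placeholder element type for B
    {
        *a_out = static_cast<float*>(workspace);

        // Advance by the A region's size in bytes, then reinterpret as B's type.
        char* b_bytes = static_cast<char*>(workspace) + a_elems * sizeof(float);
        *b_out        = reinterpret_cast<short*>(b_bytes);

        // The caller must keep the B region suitably aligned for its element type.
        assert(reinterpret_cast<std::uintptr_t>(b_bytes) % alignof(short) == 0);
    }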
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

@@ -79,7 +79,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
           dst_element_op_(dst_element_op)
     {
         static_assert(is_same_v<remove_cvref_t<SrcData>, remove_cvref_t<DstData>>,
                       "SrcData != DstData");
-        static_assert(!(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> &&
-                        (SrcScalarPerVector == 1 || DstScalarPerVector == 1)),
-                      "pk data N cannot be 1");
+        static_assert(!(is_same_v<remove_cvref_t<SrcData>, pk_i4_t> &&
+                        (SrcScalarPerVector_ == 1 || DstScalarPerVector_ == 1)),
+                      "pk data N cannot be 1");
     }

     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)

@@ -108,11 +108,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // scalar per access on each dim
         // TODO: don't use lambda_scalar_per_access
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector * PackedSize>{},
-            Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});

         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

-        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector * PackedSize) == 0,
+        static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
                       "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");

         constexpr auto src_dim_access_order = SrcDimAccessOrder{};
@@ -206,17 +206,20 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             if constexpr(decltype(src_element_op_)::is_pack8_invocable)
                 return math::min(8, SrcScalarPerVector);
         }
-        if constexpr(is_detected<is_pack4_invocable_t, decltype(src_element_op_)>::value)
+        else if constexpr(is_detected<is_pack4_invocable_t, decltype(src_element_op_)>::value)
         {
             if constexpr(decltype(src_element_op_)::is_pack4_invocable)
                 return math::min(4, SrcScalarPerVector);
         }
-        if constexpr(is_detected<is_pack2_invocable_t, decltype(src_element_op_)>::value)
+        else if constexpr(is_detected<is_pack2_invocable_t, decltype(src_element_op_)>::value)
         {
             if constexpr(decltype(src_element_op_)::is_pack2_invocable)
                 return math::min(2, SrcScalarPerVector);
         }
-        return 1;
+        else
+        {
+            return 1;
+        }
     };

     constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
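The reshaped chain above combines the detection idiom (is_detected) with if constexpr to pick the widest pack width the element-wise op advertises. A minimal standalone version of that dispatch is sketched below; std::void_t stands in for CK's is_detected, and MyOp is a hypothetical op type, so this is an illustration of the pattern rather than the CK implementation.

    #include <algorithm>
    #include <type_traits>

    // Hypothetical element-wise op exposing only a pack2 capability flag.
    struct MyOp
    {
        static constexpr bool is_pack2_invocable = true;
    };

    // Stand-ins for is_detected: true when T declares the corresponding flag.
    template <typename T, typename = void>
    struct has_pack4 : std::false_type {};
    template <typename T>
    struct has_pack4<T, std::void_t<decltype(T::is_pack4_invocable)>> : std::true_type {};

    template <typename T, typename = void>
    struct has_pack2 : std::false_type {};
    template <typename T>
    struct has_pack2<T, std::void_t<decltype(T::is_pack2_invocable)>> : std::true_type {};

    template <typename Op>
    constexpr int elem_op_vec_len(int scalar_per_vector)
    {
        // Mutually exclusive branches, as in the else-if chain above.
        if constexpr(has_pack4<Op>::value)
            return std::min(4, scalar_per_vector);
        else if constexpr(has_pack2<Op>::value)
            return std::min(2, scalar_per_vector);
        else
            return 1;
    }

    static_assert(elem_op_vec_len<MyOp>(8) == 2, "MyOp only advertises pack2");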
@@ -224,6 +227,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         using src_elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
         using dst_elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;

+        static_assert(elem_op_vec_len == 1, "elem_op_vec_len != 1");
+
         static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto idx) {
             // apply the src elementwise op and convert to DstData under the hood if needed
             src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),

@@ -285,6 +290,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     TransferDataFromSrcThreadScratchToDstThreadScratch(Number<ThreadScratchId> thread_scratch_id)
     {
 #if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
+        static_assert(false, "");
         static_ford<SliceLengths>{}([&](auto idx) {
             dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
         });

@@ -292,7 +298,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // OOB Check
         constexpr auto src_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector * PackedSize>{},
-            Number<nDim>{});
+            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector_>{}, Number<nDim>{});

         constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;

@@ -353,6 +359,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 .template SetAsType<vector_t>(src_data_idx_seq, op_r_v);
         });

+        static_assert(!(is_same_v<pk_i4_t, remove_cvref_t<SrcData>> && SrcVectorDim != DstVectorDim),
+                      "pk_i4_t does not support transpose");
         // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
         // TODO make this logic more generic for more sub-dword datatype
         if constexpr(SrcVectorDim != DstVectorDim &&

@@ -381,9 +389,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             constexpr auto scalar_per_access = generate_sequence(
                 detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
-                                                                 SrcScalarPerVector * PackedSize,
+                                                                 SrcScalarPerVector_,
                                                                  DstVectorDim,
-                                                                 DstScalarPerVector * PackedSize>{},
+                                                                 DstScalarPerVector_>{},
                 Number<nDim>{});

             constexpr auto access_lengths = SliceLengths{} / scalar_per_access;

@@ -456,7 +464,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // src scalar per access on each dim
         // TODO: don't use this
         constexpr auto dst_scalar_per_access = generate_sequence(
-            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector * PackedSize>{},
-            Number<nDim>{});
+            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector_>{}, Number<nDim>{});

         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
include/ck/utility/amd_xdlops.hpp

@@ -157,8 +157,11 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
     template <class FloatC>
     __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
     {
-        reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
-            reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+        // reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
+        //     reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
     }
 };
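The replacement body routes every argument into ignore while the intrinsic call is commented out. The same idiom with std::ignore (standing in here for ck::ignore, which this sketch does not pull in) keeps -Wunused-parameter and -Werror builds quiet around a stubbed computation:

    #include <tuple>

    // Illustrative stub: consume arguments via std::ignore so warnings-as-errors
    // builds stay clean while the real computation is disabled.
    template <class FloatC>
    void run_stub(const float& reg_a, const float& reg_b, FloatC& reg_c)
    {
        std::ignore = reg_a;
        std::ignore = reg_b;
        std::ignore = reg_c;
        // real MFMA computation intentionally omitted in this sketch
    }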
script/cmake-ck-dev.sh

@@ -17,7 +17,7 @@ fi
 cmake                                                         \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                     \
--D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
+-D CMAKE_HIP_FLAGS="-save-temps -gline-tables-only -Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
 -D CMAKE_BUILD_TYPE=Release                                   \
 -D BUILD_DEV=ON                                               \
 -D GPU_TARGETS=$GPU_TARGETS                                   \