Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
171b9030
"vscode:/vscode.git/clone" did not exist on "5a61ffe14d59fb17e336eaf31e0268fc5e904895"
Unverified
Commit
171b9030
authored
Nov 20, 2024
by
Mirza Halilčević
Committed by
GitHub
Nov 20, 2024
Browse files
Merge branch 'develop' into gemm_elementwise_gemm
parents
417f805f
da0c21f6
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
652 additions
and
28 deletions
+652
-28
profiler/include/profiler/profile_pool3d_fwd_impl.hpp
profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+45
-7
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+2
-0
profiler/src/profile_gemm_multiply_multiply.cpp
profiler/src/profile_gemm_multiply_multiply.cpp
+9
-1
profiler/src/profile_gemm_universal.cpp
profiler/src/profile_gemm_universal.cpp
+26
-5
profiler/src/profile_gemm_universal_batched.cpp
profiler/src/profile_gemm_universal_batched.cpp
+187
-0
profiler/src/profile_grouped_conv_bwd_weight.cpp
profiler/src/profile_grouped_conv_bwd_weight.cpp
+23
-2
profiler/src/profile_layernorm_fwd.cpp
profiler/src/profile_layernorm_fwd.cpp
+1
-1
python/ck4inductor/grouped_conv_fwd/gen_instances.py
python/ck4inductor/grouped_conv_fwd/gen_instances.py
+167
-0
python/ck4inductor/grouped_conv_fwd/op.py
python/ck4inductor/grouped_conv_fwd/op.py
+93
-0
python/ck4inductor/universal_gemm/gen_instances.py
python/ck4inductor/universal_gemm/gen_instances.py
+4
-1
python/ck4inductor/universal_gemm/op.py
python/ck4inductor/universal_gemm/op.py
+3
-0
python/ck4inductor/util.py
python/ck4inductor/util.py
+4
-1
script/convert_miopen_driver_to_profiler.py
script/convert_miopen_driver_to_profiler.py
+3
-2
script/process_perf_data.py
script/process_perf_data.py
+2
-2
script/process_qa_data.sh
script/process_qa_data.sh
+1
-0
test/CMakeLists.txt
test/CMakeLists.txt
+7
-6
test/ck_tile/CMakeLists.txt
test/ck_tile/CMakeLists.txt
+1
-0
test/ck_tile/gemm/CMakeLists.txt
test/ck_tile/gemm/CMakeLists.txt
+4
-0
test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
+29
-0
test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
+41
-0
No files found.
profiler/include/profiler/profile_pool3d_fwd_impl.hpp
View file @
171b9030
...
@@ -102,11 +102,22 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
...
@@ -102,11 +102,22 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_device
(
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
));
constexpr
int
inDataRangeTensor1
{
1
};
constexpr
int
inDataRangeTensor2
{
5
};
constexpr
double
inDataRangeTensor3
{
0.5
};
switch
(
in_params
.
init_method
)
switch
(
in_params
.
init_method
)
{
{
case
0
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{});
break
;
case
0
:
case
1
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
break
;
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
inDataRangeTensor1
});
default:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
0.5
,
0.5
});
break
;
case
1
:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
inDataRangeTensor2
,
inDataRangeTensor2
});
break
;
default:
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
inDataRangeTensor3
,
inDataRangeTensor3
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_di_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_di_hi_wi
.
mDesc
.
GetElementSpaceSize
());
...
@@ -229,12 +240,39 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
...
@@ -229,12 +240,39 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
{
{
out_device_buf
.
FromDevice
(
out_n_c_do_ho_wo_device
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out_n_c_do_ho_wo_device
.
mData
.
data
());
auto
tolerance
=
1e-3
;
auto
number_of_accumulations
=
1
;
bool
pass
=
ck
::
utils
::
check_err
(
out_n_c_do_ho_wo_device
.
mData
,
static_assert
(
ReduceOpId
==
ck
::
ReduceTensorOp
::
AVG
||
ReduceOpId
==
ck
::
ReduceTensorOp
::
MAX
,
"Warning: Unhandled ReduceOpId for setting up the number of accumulations!"
);
if
constexpr
(
ReduceOpId
==
ck
::
ReduceTensorOp
::
AVG
)
{
for
(
size_t
i
=
0
;
i
<
kernel_params
.
window_spatial_lengths
.
size
();
++
i
)
{
number_of_accumulations
*=
kernel_params
.
window_spatial_lengths
.
at
(
i
);
}
}
auto
absolute_error_threshold
=
1.0
;
switch
(
in_params
.
init_method
)
{
case
0
:
absolute_error_threshold
=
static_cast
<
double
>
(
inDataRangeTensor1
);
break
;
case
1
:
absolute_error_threshold
=
static_cast
<
double
>
(
inDataRangeTensor2
);
break
;
default:
absolute_error_threshold
=
inDataRangeTensor3
;
}
absolute_error_threshold
=
ck
::
utils
::
get_absolute_threshold
<
ComputeDataType
,
OutDataType
>
(
absolute_error_threshold
,
number_of_accumulations
);
auto
relative_error_threshold
=
ck
::
utils
::
get_relative_threshold
<
ComputeDataType
,
OutDataType
>
(
number_of_accumulations
);
bool
pass
=
ck
::
utils
::
check_err
(
out_n_c_do_ho_wo_device
.
mData
,
out_n_c_do_ho_wo_host
.
mData
,
out_n_c_do_ho_wo_host
.
mData
,
"Error: Incorrect results"
,
"Error: Incorrect results"
,
tolerance
,
relative_error_threshold
,
tolerance
);
absolute_error_threshold
);
if
constexpr
(
OutputIndex
)
if
constexpr
(
OutputIndex
)
{
{
...
...
profiler/src/CMakeLists.txt
View file @
171b9030
...
@@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
...
@@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
list
(
APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_splitk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_splitk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp
)
...
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
...
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
endif
()
endif
()
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_batched_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_streamk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_streamk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
...
...
profiler/src/profile_gemm_multiply_multiply.cpp
View file @
171b9030
...
@@ -27,6 +27,7 @@ enum struct GemmDataType
...
@@ -27,6 +27,7 @@ enum struct GemmDataType
F16_F8_F16
,
// 5
F16_F8_F16
,
// 5
F16_F16_F16_F8
,
// 6
F16_F16_F16_F8
,
// 6
F8_F8_BF16
,
// 7
F8_F8_BF16
,
// 7
INT8_INT8_BF16
,
// 8
};
};
#define OP_NAME "gemm_multiply_multiply"
#define OP_NAME "gemm_multiply_multiply"
...
@@ -39,7 +40,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
...
@@ -39,7 +40,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
printf
(
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, "
"f16->f8; 7: f8->bf16, "
"comp f8)
\n
"
);
"comp f8
; 8: int8->bf16
)
\n
"
);
printf
(
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 2: A[k, m] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 2: A[k, m] * B[k, n] = C[m, n];
\n
"
);
...
@@ -89,6 +90,8 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
...
@@ -89,6 +90,8 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
using
F32
=
float
;
using
F32
=
float
;
using
BF16
=
ck
::
bhalf_t
;
using
BF16
=
ck
::
bhalf_t
;
using
F8
=
ck
::
f8_t
;
using
F8
=
ck
::
f8_t
;
using
I8
=
int8_t
;
using
I32
=
int
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
...
@@ -162,6 +165,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
...
@@ -162,6 +165,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
return
profile
(
return
profile
(
F8
{},
F8
{},
F8
{},
F32
{},
F32
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{},
Col
{},
Row
{});
F8
{},
F8
{},
F8
{},
F32
{},
F32
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{},
Col
{},
Row
{});
}
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_BF16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
I8
{},
I8
{},
I8
{},
I32
{},
F32
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{},
Col
{},
Row
{});
}
else
else
{
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
...
...
profiler/src/profile_gemm_universal.cpp
View file @
171b9030
...
@@ -57,6 +57,25 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -57,6 +57,25 @@ int profile_gemm_universal(int argc, char* argv[])
exit
(
1
);
exit
(
1
);
}
}
int
M
;
int
N
;
int
StrideA
;
int
StrideB
;
// Analyze the unsupported matrix shapes, switch the M and N number
if
(
std
::
stoi
(
argv
[
9
])
%
8
!=
0
&&
std
::
stoi
(
argv
[
8
])
%
8
==
0
)
{
M
=
std
::
stoi
(
argv
[
9
]);
StrideA
=
std
::
stoi
(
argv
[
12
]);
N
=
std
::
stoi
(
argv
[
8
]);
StrideB
=
std
::
stoi
(
argv
[
11
]);
}
else
{
M
=
std
::
stoi
(
argv
[
8
]);
StrideA
=
std
::
stoi
(
argv
[
11
]);
N
=
std
::
stoi
(
argv
[
9
]);
StrideB
=
std
::
stoi
(
argv
[
12
]);
}
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
...
@@ -64,12 +83,8 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -64,12 +83,8 @@ int profile_gemm_universal(int argc, char* argv[])
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
const
int
KBatch
=
std
::
stoi
(
argv
[
14
]);
const
int
KBatch
=
std
::
stoi
(
argv
[
14
]);
...
@@ -86,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -86,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[])
using
F32
=
float
;
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
BF16
=
ck
::
bhalf_t
;
using
F8
=
ck
::
f8_t
;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using
F8
=
ck
::
f8_t
;
#endif
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
...
@@ -147,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -147,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
{
return
profile
(
F16
{},
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
return
profile
(
F16
{},
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
}
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else
if
(
data_type
==
GemmDataType
::
F16_F8_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
F16_F8_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
{
return
profile
(
F16
{},
F8
{},
F16
{},
F32
{},
F16
{},
Row
{},
Row
{},
Row
{});
return
profile
(
F16
{},
F8
{},
F16
{},
F32
{},
F16
{},
Row
{},
Row
{},
Row
{});
...
@@ -163,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -163,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
{
return
profile
(
F8
{},
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
return
profile
(
F8
{},
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
}
}
#endif
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
{
return
profile
(
BF16
{},
BF16
{},
BF16
{},
F32
{},
BF16
{},
Row
{},
Row
{},
Row
{});
return
profile
(
BF16
{},
BF16
{},
BF16
{},
F32
{},
BF16
{},
Row
{},
Row
{},
Row
{});
...
@@ -179,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -179,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
{
return
profile
(
BF16
{},
BF16
{},
BF16
{},
F32
{},
BF16
{},
Col
{},
Row
{},
Row
{});
return
profile
(
BF16
{},
BF16
{},
BF16
{},
F32
{},
BF16
{},
Col
{},
Row
{},
Row
{});
}
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else
if
(
data_type
==
GemmDataType
::
F8_F8_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
F8_F8_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
{
return
profile
(
F8
{},
F8
{},
F8
{},
F32
{},
BF16
{},
Row
{},
Row
{},
Row
{});
return
profile
(
F8
{},
F8
{},
F8
{},
F32
{},
BF16
{},
Row
{},
Row
{},
Row
{});
...
@@ -187,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[])
...
@@ -187,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
{
return
profile
(
F8
{},
F8
{},
F8
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{});
return
profile
(
F8
{},
F8
{},
F8
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{});
}
}
#endif
else
else
{
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
...
...
profiler/src/profile_gemm_universal_batched.cpp
0 → 100644
View file @
171b9030
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_universal_batched_impl.hpp"
#include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp"
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
MK_NK_MN
,
// 1
KM_KN_MN
,
// 2
KM_NK_MN
,
// 3
};
enum
struct
GemmDataType
{
BF16_BF16_BF16
,
// 0
F8_F8_BF16
,
// 1
};
#define OP_NAME "gemm_universal_batched"
#define OP_DESC "Batched GEMM Universal"
int
profile_batched_gemm_universal
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
18
&&
argc
!=
21
)
{
// clang-format off
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: bf16, 1: fp8->bf16)
\n
"
);
printf
(
"arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];
\n
"
);
printf
(
" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];
\n
"
);
printf
(
" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];
\n
"
);
printf
(
" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=n0, 1=yes)
\n
"
);
printf
(
"arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount
\n
"
);
printf
(
"optional:
\n
"
);
printf
(
"arg18: number of warm-up cycles (default 1)
\n
"
);
printf
(
"arg19: number of iterations (default 10)
\n
"
);
printf
(
"arg20: memory for rotating buffer (default 0, size in MB)
\n
"
);
// clang-format on
exit
(
1
);
}
int
n_warmup
=
1
;
int
n_iter
=
10
;
uint64_t
rotating
=
0
;
if
(
argc
==
21
)
{
n_warmup
=
std
::
stoi
(
argv
[
18
]);
n_iter
=
std
::
stoi
(
argv
[
19
]);
rotating
=
std
::
stoull
(
argv
[
20
])
*
1024
*
1024
;
}
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
const
int
BatchStrideA
=
std
::
stoi
(
argv
[
14
]);
const
int
BatchStrideB
=
std
::
stoi
(
argv
[
15
]);
const
int
BatchStrideC
=
std
::
stoi
(
argv
[
16
]);
const
int
BatchCount
=
std
::
stoi
(
argv
[
17
]);
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using
F8
=
ck
::
f8_t
;
#endif
using
BF16
=
ck
::
bhalf_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
DsDataType
=
ck
::
Tuple
<>
;
using
CDataType
=
decltype
(
c_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
DsLayout
=
ck
::
Tuple
<>
;
using
CLayout
=
decltype
(
c_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
const
int
StrideA_
=
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
;
const
int
StrideB_
=
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
;
const
int
StrideC_
=
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
;
const
int
DefaultBatchStrideA
=
(
ck
::
is_same_v
<
ALayout
,
Row
>
?
M
:
K
)
*
StrideA_
;
const
int
DefaultBatchStrideB
=
(
ck
::
is_same_v
<
BLayout
,
Row
>
?
K
:
N
)
*
StrideB_
;
const
int
DefaultBatchStrideC
=
(
ck
::
is_same_v
<
CLayout
,
Row
>
?
M
:
N
)
*
StrideC_
;
const
int
BatchStrideA_
=
(
BatchStrideA
<
0
)
?
DefaultBatchStrideA
:
BatchStrideA
;
const
int
BatchStrideB_
=
(
BatchStrideB
<
0
)
?
DefaultBatchStrideB
:
BatchStrideB
;
const
int
BatchStrideC_
=
(
BatchStrideC
<
0
)
?
DefaultBatchStrideC
:
BatchStrideC
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemmV2MultiD
<
ALayout
,
BLayout
,
DsLayout
,
CLayout
,
ADataType
,
BDataType
,
DsDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
bool
pass
=
ck
::
profiler
::
profile_gemm_universal_batched_impl
<
ADataType
,
BDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
,
AElementOp
,
BElementOp
,
CElementOp
,
DeviceOp
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
BatchStrideA_
,
BatchStrideB_
,
BatchStrideC_
,
StrideA_
,
StrideB_
,
StrideC_
,
BatchCount
,
n_warmup
,
n_iter
,
rotating
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
BF16
{},
BF16
{},
BF16
{},
Row
{},
Col
{},
Row
{});
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else
if
(
data_type
==
GemmDataType
::
F8_F8_BF16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
F8
{},
F8
{},
BF16
{},
Row
{},
Col
{},
Row
{});
}
#endif
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_batched_gemm_universal
);
profiler/src/profile_grouped_conv_bwd_weight.cpp
View file @
171b9030
...
@@ -25,7 +25,8 @@ enum struct ConvDataType
...
@@ -25,7 +25,8 @@ enum struct ConvDataType
F16_F16_F16
,
// 1
F16_F16_F16
,
// 1
BF16_F32_BF16
,
// 2
BF16_F32_BF16
,
// 2
F16_F16_F16_BF8_F8
,
// 3
F16_F16_F16_BF8_F8
,
// 3
I8_I8_I8
// 4
I8_I8_I8
,
// 4
BF16_BF16_BF16
,
// 5
};
};
#define OP_NAME "grouped_conv_bwd_weight"
#define OP_NAME "grouped_conv_bwd_weight"
...
@@ -38,7 +39,8 @@ static void print_helper_msg()
...
@@ -38,7 +39,8 @@ static void print_helper_msg()
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 2: Input bf16, Weight fp32, Output bf16
\n
"
<<
" 2: Input bf16, Weight fp32, Output bf16
\n
"
<<
" 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8
\n
"
<<
" 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8
\n
"
<<
" 4: Input int8, Weight int8, Output int8)
\n
"
<<
" 4: Input int8, Weight int8, Output int8
\n
"
<<
" 5: Input bf16, Weight bf16, Output bf16)
\n
"
<<
"arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
<<
"arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
"N, K, Ho, Wo]
\n
"
"N, K, Ho, Wo]
\n
"
<<
" 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
<<
" 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
...
@@ -180,6 +182,10 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
...
@@ -180,6 +182,10 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
}
if
(
data_type
==
ConvDataType
::
BF16_BF16_BF16
)
{
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{});
}
}
}
else
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
NGCHW_GKYXC_NGKHW
)
else
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
NGCHW_GKYXC_NGKHW
)
{
{
...
@@ -187,6 +193,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
...
@@ -187,6 +193,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
{
return
profile
(
I2
,
NGCHW
{},
GKYXC
{},
NGKHW
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
return
profile
(
I2
,
NGCHW
{},
GKYXC
{},
NGKHW
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
}
if
(
data_type
==
ConvDataType
::
BF16_BF16_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I2
,
NGCHW
{},
GKYXC
{},
NGKHW
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{});
}
}
}
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
{
{
...
@@ -224,6 +235,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
...
@@ -224,6 +235,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
}
if
(
data_type
==
ConvDataType
::
BF16_BF16_BF16
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{});
}
if
(
data_type
==
ConvDataType
::
F16_F16_F16_BF8_F8
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16_BF8_F8
)
{
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F16
{},
F16
{},
F16
{},
BF8
{},
F8
{});
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F16
{},
F16
{},
F16
{},
BF8
{},
F8
{});
...
@@ -240,6 +256,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
...
@@ -240,6 +256,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
{
return
profile
(
I3
,
NGCDHW
{},
GKZYXC
{},
NGKDHW
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
return
profile
(
I3
,
NGCDHW
{},
GKZYXC
{},
NGKDHW
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
}
if
(
data_type
==
ConvDataType
::
BF16_BF16_BF16
)
{
return
profile
(
I3
,
NGCDHW
{},
GKZYXC
{},
NGKDHW
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{},
BF16
{});
}
}
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
...
...
profiler/src/profile_layernorm_fwd.cpp
View file @
171b9030
...
@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
...
@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
{
ck
::
profiler
::
profile_layernorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F
32
,
false
,
rank
>
(
ck
::
profiler
::
profile_layernorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F
16
,
false
,
rank
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
}
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
...
...
python/ck4inductor/grouped_conv_fwd/gen_instances.py
0 → 100644
View file @
171b9030
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import
logging
import
os
import
subprocess
from
dataclasses
import
replace
from
functools
import
lru_cache
from
typing
import
List
from
..util
import
library_path
from
.op
import
CKGroupedConvFwdOp
log
=
logging
.
getLogger
(
__name__
)
def
_ck_conv_instances_path
():
conv_instances_path
=
os
.
path
.
join
(
# noqa: F821
library_path
(),
"include"
,
"ck"
,
"library"
,
"tensor_operation_instance"
,
"gpu"
,
"grouped_conv_fwd"
,
)
if
not
os
.
path
.
exists
(
conv_instances_path
):
log
.
error
(
"CK library conv instances path %s does not exist"
,
conv_instances_path
)
return
None
return
conv_instances_path
def
parse_instances
(
str_instances
:
List
[
str
])
->
List
[
CKGroupedConvFwdOp
]:
"""
Parse the lines containing Grouped Convolution Forward template instances
into `CKGroupedConvFwdOp` instances
"""
def
maybe_int
(
s
):
try
:
return
int
(
s
)
except
ValueError
:
return
s
op_instances
=
[]
# TODO: maybe use libclang for parsing C++ code in the future
# to avoid this hacky parsing logic below ? :) - copilot
for
line
in
str_instances
:
s_template_args
=
line
.
split
(
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
)[
-
1
].
strip
(
"<>, "
)
template_args
=
[]
i_current
=
0
while
i_current
<
len
(
s_template_args
):
if
s_template_args
[
i_current
]
==
" "
:
# skip whitespace
i_current
+=
1
continue
elif
s_template_args
[
i_current
:
i_current
+
2
]
==
"S<"
:
# parse template S<Index...>
i_next
=
s_template_args
.
find
(
">"
,
i_current
)
template_args
.
append
(
tuple
(
map
(
int
,
s_template_args
[
i_current
+
2
:
i_next
].
split
(
","
)))
)
i_current
=
i_next
+
2
else
:
# all string attributes must be either type aliases or global constants in C++
i_next
=
s_template_args
.
find
(
","
,
i_current
)
template_args
.
append
(
maybe_int
(
s_template_args
[
i_current
:
i_next
if
i_next
!=
-
1
else
None
]
)
)
if
i_next
!=
-
1
:
i_current
=
i_next
+
1
if
i_next
==
-
1
:
break
template_args
[
0
]
=
-
1
# n_dim_spatial
template_args
[
3
]
=
tuple
()
# ds_layout
template_args
[
9
]
=
tuple
()
# ds_element_dtype
new_instance
=
CKGroupedConvFwdOp
(
*
template_args
,
# type: ignore[arg-type]
)
op_instances
.
append
(
new_instance
)
return
op_instances
@
lru_cache
(
None
)
def
gen_conv_ops_library
()
->
List
[
CKGroupedConvFwdOp
]:
"""
Parse the Grouped Convolution Forward instances
defined in the Composable Kernel library folder.
"""
ck_library_dir
=
_ck_conv_instances_path
()
if
not
ck_library_dir
:
return
[]
grep_result
=
subprocess
.
run
(
[
"grep"
,
"-inR"
,
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
,
ck_library_dir
,
],
capture_output
=
True
,
text
=
True
,
)
op_instances
=
parse_instances
(
grep_result
.
stdout
.
strip
().
split
(
"
\n
"
))
log
.
debug
(
"ck instances from library: %d"
,
len
(
op_instances
))
schedulers
=
[
"BlockGemmPipelineScheduler::Intrawave"
,
"BlockGemmPipelineScheduler::Interwave"
,
]
conv_specs
=
[
"ConvolutionForwardSpecialization::Default"
,
"ConvolutionForwardSpecialization::Filter1x1Pad0"
,
"ConvolutionForwardSpecialization::Filter1x1Stride1Pad0"
,
"ConvolutionForwardSpecialization::OddC"
,
]
# substitute templated args by looping through their domains
substitute_instances
=
[]
for
instance
in
op_instances
:
sub_scheduler
=
(
instance
.
block_gemm_pipeline_scheduler
==
"BlkGemmPipeSched"
)
sub_spec
=
instance
.
conv_forward_specialization
==
"ConvSpec"
schedulers_range
=
(
schedulers
if
sub_scheduler
else
[
instance
.
block_gemm_pipeline_scheduler
]
)
spec_range
=
conv_specs
if
sub_spec
else
[
instance
.
conv_forward_specialization
]
for
scheduler
in
schedulers_range
:
for
spec
in
spec_range
:
for
channels_last
in
[
True
,
False
]:
if
channels_last
:
a_layout
=
"NHWGC"
e_layout
=
"NHWGK"
else
:
a_layout
=
"NGCHW"
e_layout
=
"NGKHW"
substitute_instances
.
append
(
replace
(
instance
,
block_gemm_pipeline_scheduler
=
scheduler
,
conv_forward_specialization
=
spec
,
gemm_specialization
=
"GemmSpecialization::MNKPadding"
,
n_dim_spatial
=
2
,
a_layout
=
a_layout
,
b_layout
=
"GKYXC"
,
e_layout
=
e_layout
,
)
)
return
substitute_instances
if
__name__
==
"__main__"
:
print
(
gen_conv_ops_library
())
python/ck4inductor/grouped_conv_fwd/op.py
0 → 100644
View file @
171b9030
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Optional
,
Tuple
@dataclass
class CKGroupedConvFwdOp:
    """Template parameters for one CK
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 instance.

    Each field mirrors one template argument of the C++ kernel; tuple-valued
    fields correspond to multi-dimensional template parameter packs
    (thread-cluster lengths, access orders, etc.).
    """

    n_dim_spatial: int
    a_layout: str
    b_layout: str
    ds_layout: Tuple[str]
    e_layout: str
    a_element_dtype: str
    b_element_dtype: str
    acc_dtype: str
    c_shuffle_dtype: str
    ds_element_dtype: Tuple[str]
    e_element_dtype: str
    a_elementwise_op: str
    b_elementwise_op: str
    cde_elementwise_op: str
    conv_forward_specialization: str
    gemm_specialization: str
    block_size: int
    m_per_block: int
    n_per_block: int
    k_per_block: int
    ak1: int
    bk1: int
    m_per_xdl: int
    n_per_xdl: int
    m_xdl_per_wave: int
    n_xdl_per_wave: int
    a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int]
    a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
    a_block_transfer_src_access_order: Tuple[int, int, int]
    a_block_transfer_src_vector_dim: int
    a_block_transfer_src_scalar_per_vector: int
    a_block_transfer_dst_scalar_per_vector_ak1: int
    a_block_lds_extra_m: bool
    b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int]
    b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
    b_block_transfer_src_access_order: Tuple[int, int, int]
    b_block_transfer_src_vector_dim: int
    b_block_transfer_src_scalar_per_vector: int
    b_block_transfer_dst_scalar_per_vector_bk1: int
    b_block_lds_extra_n: bool
    c_shuffle_m_xdl_per_wave_per_shuffle: int
    c_shuffle_n_xdl_per_wave_per_shuffle: int
    cde_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: Tuple[  # noqa
        int,
        int,
        int,
        int,
    ]
    cde_block_transfer_scalar_per_vector_n_per_block: int
    block_gemm_pipeline_scheduler: str
    block_gemm_pipeline_version: str
    # Optional compute types default to None (the C++ side then uses the
    # element dtypes).
    a_compute_dtype: Optional[str] = None
    b_compute_dtype: Optional[str] = None

    def name(self):
        """Return the C++ alias used for this template instance."""
        # cpp alias for template instance
        return (
            f"ck_device_grouped_convolution_fwd_multiple_abd_xdl_c_shuffle_v3_"
            f"{self.key_name()}"
        )

    def key_name(self):
        """Return a deterministic key built from every field.

        Must be unique per instance; intended for use as a dict key. Each
        field contributes "K<fieldname>V<value>", with tuple values joined by
        "x" and "::" stripped from enum-like strings.
        """
        return "_".join(
            [
                "K" + field_name.replace("_", "").lower()
                + "V"
                + (
                    # Tuples (e.g. cluster lengths) become "a x b x c" form;
                    # `map` iterates the tuple directly, no `iter()` needed.
                    "x".join(map(str, field_value))
                    if isinstance(field_value, tuple)
                    # Scalar fields: drop "::" from C++ scoped names.
                    else str(field_value).replace(":", "")
                )
                for field_name, field_value in self.dict_items()
            ]
        )

    def dict_items(self):
        """Return (field_name, field_value) pairs for all dataclass fields."""
        return asdict(self).items()
python/ck4inductor/universal_gemm/gen_instances.py
View file @
171b9030
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import
logging
import
logging
import
os
import
os
import
subprocess
import
subprocess
from
dataclasses
import
fields
,
replace
from
dataclasses
import
replace
from
functools
import
lru_cache
,
partial
from
functools
import
lru_cache
,
partial
from
typing
import
List
from
typing
import
List
...
...
python/ck4inductor/universal_gemm/op.py
View file @
171b9030
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
from
dataclasses
import
asdict
,
dataclass
from
dataclasses
import
asdict
,
dataclass
from
typing
import
Optional
,
Tuple
from
typing
import
Optional
,
Tuple
...
...
python/ck4inductor/util.py
View file @
171b9030
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import
functools
import
functools
import
os
import
os
@
functools
.
lru_cache
(
None
)
@
functools
.
lru_cache
(
None
)
def
library_path
():
def
library_path
():
return
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'
library
'
)
return
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"
library
"
)
script/convert_miopen_driver_to_profiler.py
View file @
171b9030
...
@@ -65,8 +65,9 @@ def parse_data_type(args):
...
@@ -65,8 +65,9 @@ def parse_data_type(args):
if
args
.
ck_profier_op
==
"grouped_conv_fwd"
:
if
args
.
ck_profier_op
==
"grouped_conv_fwd"
:
args
.
data_type
=
3
args
.
data_type
=
3
if
args
.
data_type
==
"bfp16"
:
if
args
.
data_type
==
"bfp16"
:
if
args
.
ck_profier_op
==
"grouped_conv_bwd_weight"
or
\
if
args
.
ck_profier_op
==
"grouped_conv_bwd_weight"
:
args
.
ck_profier_op
==
"grouped_conv_bwd_data"
or
\
args
.
data_type
=
5
if
args
.
ck_profier_op
==
"grouped_conv_bwd_data"
or
\
args
.
ck_profier_op
==
"grouped_conv_fwd"
:
args
.
ck_profier_op
==
"grouped_conv_fwd"
:
args
.
data_type
=
2
args
.
data_type
=
2
...
...
script/process_perf_data.py
View file @
171b9030
...
@@ -133,12 +133,12 @@ def parse_logfile(logfile):
...
@@ -133,12 +133,12 @@ def parse_logfile(logfile):
if
'Best Perf'
in
line
:
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
lst
=
line
.
split
()
res
.
append
(
lst
[
4
])
res
.
append
(
lst
[
4
])
elif
'onnx_gemm'
in
logfile
or
'mixed_gemm'
in
logfile
:
elif
'onnx_gemm'
in
logfile
:
for
line
in
open
(
logfile
):
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
lst
=
line
.
split
()
res
.
append
(
lst
[
33
])
res
.
append
(
lst
[
33
])
elif
'splitK_gemm'
in
logfile
:
elif
'splitK_gemm'
in
logfile
or
'mixed_gemm'
in
logfile
:
for
line
in
open
(
logfile
):
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
lst
=
line
.
split
()
...
...
script/process_qa_data.sh
View file @
171b9030
...
@@ -22,6 +22,7 @@ python3 process_perf_data.py perf_gemm_bilinear.log
...
@@ -22,6 +22,7 @@ python3 process_perf_data.py perf_gemm_bilinear.log
python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
python3 process_perf_data.py perf_mixed_gemm.log
file
=
./perf_fmha_fwd_gfx942.log
file
=
./perf_fmha_fwd_gfx942.log
if
[
-e
"
$file
"
]
;
then
if
[
-e
"
$file
"
]
;
then
...
...
test/CMakeLists.txt
View file @
171b9030
...
@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
...
@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
#only continue if there are some source files left on the list
#only continue if there are some source files left on the list
if
(
ARGN
)
if
(
ARGN
)
if
(
ARGN MATCHES
"_xdl"
)
if
(
ARGN MATCHES
"_xdl"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
)
list
(
REMOVE_ITEM TEST_TARGETS
gfx900 gfx906 gfx906:xnack-
gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
gfx10.3-generic gfx11-generic gfx12-generic
)
elseif
(
ARGN MATCHES
"_wmma"
)
elseif
(
ARGN MATCHES
"_wmma"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030
)
list
(
REMOVE_ITEM TEST_TARGETS
gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030
)
elseif
(
ARGN MATCHES
"_smfmac"
)
elseif
(
ARGN MATCHES
"_smfmac"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201
)
list
(
REMOVE_ITEM TEST_TARGETS
gfx900 gfx906 gfx906:xnack-
gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201
gfx10.3-generic gfx11-generic gfx12-generic
)
endif
()
endif
()
set_source_files_properties
(
${
ARGN
}
PROPERTIES LANGUAGE HIP
)
set_source_files_properties
(
${
ARGN
}
PROPERTIES LANGUAGE HIP
)
add_executable
(
${
TEST_NAME
}
${
ARGN
}
)
add_executable
(
${
TEST_NAME
}
${
ARGN
}
)
...
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
...
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
#only continue if there are some source files left on the list
#only continue if there are some source files left on the list
if
(
ARGN
)
if
(
ARGN
)
if
(
ARGN MATCHES
"_xdl"
)
if
(
ARGN MATCHES
"_xdl"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
)
list
(
REMOVE_ITEM TEST_TARGETS gfx900 gfx906
gfx906:xnack-
gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
gfx10.3-generic gfx11-generic gfx12-generic
)
elseif
(
ARGN MATCHES
"_wmma"
)
elseif
(
ARGN MATCHES
"_wmma"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030
)
list
(
REMOVE_ITEM TEST_TARGETS gfx900 gfx906
gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030
)
elseif
(
ARGN MATCHES
"_smfmac"
)
elseif
(
ARGN MATCHES
"_smfmac"
)
list
(
REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201
)
list
(
REMOVE_ITEM TEST_TARGETS
gfx900 gfx906 gfx906:xnack-
gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201
gfx10.3-generic gfx11-generic gfx12-generic
)
endif
()
endif
()
set_source_files_properties
(
${
ARGN
}
PROPERTIES LANGUAGE HIP
)
set_source_files_properties
(
${
ARGN
}
PROPERTIES LANGUAGE HIP
)
add_executable
(
${
TEST_NAME
}
${
ARGN
}
)
add_executable
(
${
TEST_NAME
}
${
ARGN
}
)
...
@@ -210,3 +210,4 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL
...
@@ -210,3 +210,4 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL
add_subdirectory
(
smfmac_op
)
add_subdirectory
(
smfmac_op
)
endif
()
endif
()
add_subdirectory
(
position_embedding
)
add_subdirectory
(
position_embedding
)
add_subdirectory
(
scatter_gather
)
test/ck_tile/CMakeLists.txt
View file @
171b9030
add_subdirectory
(
image_to_column
)
add_subdirectory
(
image_to_column
)
add_subdirectory
(
gemm
)
test/ck_tile/gemm/CMakeLists.txt
0 → 100644
View file @
171b9030
# Currently ck_tile is only built on gfx9
if(GPU_TARGETS MATCHES "gfx9")
    # GEMM memory-pipeline gtest for the ck_tile backend.
    add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
endif()
test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
0 → 100644
View file @
171b9030
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "gtest/gtest.h"

#include "ck_tile/host.hpp"
#include "test_gemm_mem_pipeline_util.hpp"

// Short aliases for the element/accumulator types exercised by the suite.
using F16 = ck_tile::half_t;
using F32 = float;

// Short aliases for the tensor layouts exercised by the suite.
using Row = ck_tile::tensor_layout::gemm::RowMajor;
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;

// All four A/B layout combinations with a row-major C output; f16 inputs,
// f32 accumulation, f16 output.
// clang-format off
using KernelTypes = ::testing::Types<
    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
    std::tuple<     Row,     Col,     Row,      F16,       F16,         F32,       F16>,
    std::tuple<     Col,     Row,     Row,      F16,       F16,         F32,       F16>,
    std::tuple<     Row,     Row,     Row,      F16,       F16,         F32,       F16>,
    std::tuple<     Col,     Col,     Row,      F16,       F16,         F32,       F16>
    >;
// clang-format on

TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);

// The individual TYPED_TEST cases live in the shared .inc so other pipeline
// suites can reuse them — NOTE(review): assumed from the file split; confirm.
#include "test_gemm_mem_pipeline_ut_cases.inc"
test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
0 → 100644
View file @
171b9030
#pragma once
// Very small M values (1..6) against a fixed N and K.
TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
{
    constexpr int N = 1024;
    constexpr int K = 320;

    for(const int M : {1, 2, 3, 4, 5, 6})
    {
        this->Run(M, N, K);
    }
}
// Medium-to-large, non-power-of-two M values against a fixed N and K.
TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
{
    constexpr int N = 1024;
    constexpr int K = 320;

    for(const int M : {127, 255, 312, 799, 1573})
    {
        this->Run(M, N, K);
    }
}
// A K extent (432) chosen to exercise K-dimension padding, with M = 127.
TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
{
    constexpr int M = 127;
    constexpr int N = 1024;
    constexpr int K = 432;

    this->Run(M, N, K);
}
// A regular, well-aligned problem size: 512 x 1024 x 512.
TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
{
    constexpr int M = 512;
    constexpr int N = 1024;
    constexpr int K = 512;

    this->Run(M, N, K);
}
Prev
1
…
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment