Unverified Commit 3d61f89a authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Merge pull request #134 from ROCm/merge_from_public

Merge from public
parents c160c6cf 4558a3f8
...@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial, ...@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial,
typename WeiDataType, typename WeiDataType,
typename OutDataType, typename OutDataType,
typename AComputeType = InDataType, typename AComputeType = InDataType,
typename BComputeType = AComputeType> typename BComputeType = AComputeType,
typename IndexType = ck::index_t>
bool profile_grouped_conv_fwd_impl(int do_verification, bool profile_grouped_conv_fwd_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
const auto out_g_n_k_wos_desc = const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param); ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{}; std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{}; std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{}; std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{}; std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{}; std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{}; std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<IndexType, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<IndexType, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<IndexType, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{}; std::array<IndexType, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
......
...@@ -46,12 +46,17 @@ if(GPU_TARGETS MATCHES "gfx9") ...@@ -46,12 +46,17 @@ if(GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
endif() endif()
list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp) list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
if(GPU_TARGETS MATCHES "gfx94")
list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
endif()
list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp) list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp) list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
...@@ -79,6 +84,11 @@ set(PROFILER_EXECUTABLE ckProfiler) ...@@ -79,6 +84,11 @@ set(PROFILER_EXECUTABLE ckProfiler)
add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
# flags to compress the library
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
message("Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
...@@ -120,8 +130,13 @@ if(GPU_TARGETS MATCHES "gfx9") ...@@ -120,8 +130,13 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
if(GPU_TARGETS MATCHES "gfx94")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_ab_scale_impl.hpp"
#include "profiler_operation_registry.hpp"
// Storage layouts for A * B = E, selected by command-line arg3.
// Naming: <A layout>_<B layout>_<E layout>, e.g. MK_NK_MN means
// A is [M, K] (row-major), B is [N, K] (column-major), E is [M, N].
enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
};

// A/B/E data-type combinations, selected by command-line arg2.
enum struct GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
    F8_F16_F16,     // 4
    F16_F8_F16,     // 5
    F16_F16_F16_F8, // 6
    F8_F8_BF16,     // 7
};

// Granularity of the per-block scale factors (ScaleBlockM/N/K),
// selected by command-line arg4.
enum struct ScaleBlockTile
{
    Tile_128_128_128, // 0
};
#define OP_NAME "gemm_ab_scale"
#define OP_DESC "GEMM_AB_Scale"

// Profiler entry point for GEMM with block-wise A/B scaling.
// Parses the command line, selects the template instantiation matching the
// requested data type / scale-block tile / layout combination, and dispatches
// to ck::profiler::profile_gemm_ab_scale_impl.
// Returns 0 when the profiled run passes, 1 otherwise (also on unsupported
// configurations); calls exit(1) on malformed command lines.
int profile_gemm_ab_scale(int argc, char* argv[])
{
    // 14 mandatory arguments (argc == 15), or 17 when the optional
    // warm-up / iteration / rotating-buffer triple is appended (argc == 18).
    if(argc != 15 && argc != 18)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
               "f16->f8; 7: f8->bf16, "
               "comp f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: scale block tile (0: ScaleBlockM/N/K = [128, 128, 128];\n");
        printf("arg5: verification (0: no; 1: yes)\n");
        printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg7: print tensor value (0: no; 1: yes)\n");
        printf("arg8: time kernel (0=no, 1=yes)\n");
        printf("arg9 to 14: M, N, K, StrideA, StrideB, StrideE\n");
        printf("optional:\n");
        printf("arg15: number of warm-up cycles (default 1)\n");
        printf("arg16: number of iterations (default 10)\n");
        printf("arg17: memory for rotating buffer (default 0, size in MB)\n");
        exit(1);
    }

    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const auto scale_block_tile = static_cast<ScaleBlockTile>(std::stoi(argv[4]));
    const bool do_verification = std::stoi(argv[5]);
    const int init_method = std::stoi(argv[6]);
    const bool do_log = std::stoi(argv[7]);
    const bool time_kernel = std::stoi(argv[8]);
    const int M = std::stoi(argv[9]);
    const int N = std::stoi(argv[10]);
    const int K = std::stoi(argv[11]);
    const int StrideA = std::stoi(argv[12]);
    const int StrideB = std::stoi(argv[13]);
    const int StrideE = std::stoi(argv[14]);

    // Benchmarking controls; overridden by the optional trailing arguments.
    int n_warmup = 1;
    int n_iter = 10;
    uint64_t rotating = 0;
    if(argc == 18)
    {
        n_warmup = std::stoi(argv[15]);
        n_iter = std::stoi(argv[16]);
        // argv[17] is given in MB; convert to bytes.
        rotating = std::stoull(argv[17]) * 1024 * 1024;
    }

    using F32 = float;
    using BF16 = ck::bhalf_t;
    using F8 = ck::f8_t;

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    // Dispatcher: the tag arguments carry the data types, the compile-time
    // scale-block sizes (ck::Number<>) and the layouts into the template
    // parameters of profile_gemm_ab_scale_impl. A negative stride on the
    // command line selects the packed default stride for that layout.
    auto profile = [&](auto a0_type,
                       auto a1_type,
                       auto b0_type,
                       auto b1_type,
                       auto comp_type,
                       auto acc_type,
                       auto c_type,
                       auto scale_block_m,
                       auto scale_block_n,
                       auto scale_block_k,
                       auto a_layout,
                       auto b_layout,
                       auto e_layout) {
        using A0DataType = decltype(a0_type);
        using A1DataType = decltype(a1_type);
        using B0DataType = decltype(b0_type);
        using B1DataType = decltype(b1_type);
        using ComputeDataType = decltype(comp_type);
        using AccDataType = decltype(acc_type);
        using EDataType = decltype(c_type);

        using ALayout = decltype(a_layout);
        using BLayout = decltype(b_layout);
        using ELayout = decltype(e_layout);

        // Packed (contiguous) strides for each layout.
        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
        const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;

        bool pass = ck::profiler::profile_gemm_ab_scale_impl<A0DataType,
                                                             A1DataType,
                                                             B0DataType,
                                                             B1DataType,
                                                             ComputeDataType,
                                                             AccDataType,
                                                             EDataType,
                                                             scale_block_m,
                                                             scale_block_n,
                                                             scale_block_k,
                                                             ALayout,
                                                             BLayout,
                                                             ELayout>(
            do_verification,
            init_method,
            do_log,
            time_kernel,
            M,
            N,
            K,
            (StrideA < 0) ? DefaultStrideA : StrideA,
            (StrideB < 0) ? DefaultStrideB : StrideB,
            (StrideE < 0) ? DefaultStrideE : StrideE,
            n_warmup,
            n_iter,
            rotating);

        return pass ? 0 : 1;
    };

    // Currently only f8 x f8 -> bf16 (fp32 scales) with row-major A,
    // column-major B and the 128x128x128 scale tile is wired up.
    if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
       scale_block_tile == ScaleBlockTile::Tile_128_128_128)
    {
        return profile(F8{},
                       F32{},
                       F8{},
                       F32{},
                       F8{},
                       F32{},
                       BF16{},
                       ck::Number<128>{},
                       ck::Number<128>{},
                       ck::Number<128>{},
                       Row{},
                       Col{},
                       Row{});
    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;
        return 1;
    }
}

// Makes the operation reachable via "ckProfiler gemm_ab_scale ...".
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_ab_scale);
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_multiply_multiply_impl.hpp"
#include "profiler_operation_registry.hpp"
// Storage layouts for A * B = E, selected by command-line arg3.
// Naming: <A layout>_<B layout>_<E layout>, e.g. MK_NK_MN means
// A is [M, K] (row-major), B is [N, K] (column-major), E is [M, N].
enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
};

// A/B/E data-type combinations, selected by command-line arg2.
enum struct GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
    F8_F16_F16,     // 4
    F16_F8_F16,     // 5
    F16_F16_F16_F8, // 6
    F8_F8_BF16,     // 7
};
#define OP_NAME "gemm_multiply_multiply"
#define OP_DESC "GEMM_Multiply_Multiply"

// Profiler entry point for GEMM with two elementwise multiply operands in the
// epilogue: E = (A x B) * D0 * D1. Parses the command line, selects the
// template instantiation for the requested data type / layout combination and
// dispatches to ck::profiler::profile_gemm_multiply_multiply_impl.
// Returns 0 when the profiled run passes, 1 otherwise (also on unsupported
// configurations); calls exit(1) on malformed command lines.
int profile_gemm_multiply_multiply(int argc, char* argv[])
{
    // 15 mandatory arguments (argc == 16), or 19 when the optional
    // kbatch / warm-up / iteration / rotating-buffer group is appended
    // (argc == 20).
    if(argc != 16 && argc != 20)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
               "f16->f8; 7: f8->bf16, "
               "comp f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
        printf("optional:\n");
        printf("arg16: number of kbatch (default 1)\n");
        printf("arg17: number of warm-up cycles (default 1)\n");
        printf("arg18: number of iterations (default 10)\n");
        printf("arg19: memory for rotating buffer (default 0, size in MB)\n");
        exit(1);
    }

    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method = std::stoi(argv[5]);
    const bool do_log = std::stoi(argv[6]);
    const bool time_kernel = std::stoi(argv[7]);
    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);
    const int StrideA = std::stoi(argv[11]);
    const int StrideB = std::stoi(argv[12]);
    const int StrideD0 = std::stoi(argv[13]);
    const int StrideD1 = std::stoi(argv[14]);
    const int StrideE = std::stoi(argv[15]);

    // Benchmarking controls; overridden by the optional trailing arguments.
    int n_warmup = 1;
    int n_iter = 10;
    uint64_t rotating = 0;
    // Split-K factor for the K dimension.
    int KBatch = 1;
    if(argc == 20)
    {
        KBatch = std::stoi(argv[16]);
        n_warmup = std::stoi(argv[17]);
        n_iter = std::stoi(argv[18]);
        // argv[19] is given in MB; convert to bytes.
        rotating = std::stoull(argv[19]) * 1024 * 1024;
    }

    using F32 = float;
    using BF16 = ck::bhalf_t;
    using F8 = ck::f8_t;

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    // Dispatcher: the tag arguments carry the data types and layouts into the
    // template parameters of profile_gemm_multiply_multiply_impl. A negative
    // stride on the command line selects the packed default stride for that
    // layout.
    auto profile = [&](auto a_type,
                       auto b_type,
                       auto comp_type,
                       auto acc_type,
                       auto d0_type,
                       auto d1_type,
                       auto c_type,
                       auto a_layout,
                       auto b_layout,
                       auto d0_layout,
                       auto d1_layout,
                       auto e_layout) {
        using ADataType = decltype(a_type);
        using BDataType = decltype(b_type);
        using ComputeDataType = decltype(comp_type);
        using D0DataType = decltype(d0_type);
        using D1DataType = decltype(d1_type);
        using AccDataType = decltype(acc_type);
        using EDataType = decltype(c_type);

        using ALayout = decltype(a_layout);
        using BLayout = decltype(b_layout);
        using D0Layout = decltype(d0_layout);
        using D1Layout = decltype(d1_layout);
        using ELayout = decltype(e_layout);

        // Packed (contiguous) strides for each layout.
        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
        const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
        const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
        const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;

        bool pass = ck::profiler::profile_gemm_multiply_multiply_impl<ADataType,
                                                                      BDataType,
                                                                      ComputeDataType,
                                                                      AccDataType,
                                                                      D0DataType,
                                                                      D1DataType,
                                                                      EDataType,
                                                                      ALayout,
                                                                      BLayout,
                                                                      D0Layout,
                                                                      D1Layout,
                                                                      ELayout>(
            do_verification,
            init_method,
            do_log,
            time_kernel,
            M,
            N,
            K,
            (StrideA < 0) ? DefaultStrideA : StrideA,
            (StrideB < 0) ? DefaultStrideB : StrideB,
            (StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
            (StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
            (StrideE < 0) ? DefaultStrideE : StrideE,
            KBatch,
            n_warmup,
            n_iter,
            rotating);

        return pass ? 0 : 1;
    };

    // Currently only f8 x f8 -> bf16 (fp32 D0/D1 multipliers) with row-major A
    // and column-major B is wired up.
    if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        return profile(
            F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;
        return 1;
    }
}

// Makes the operation reachable via "ckProfiler gemm_multiply_multiply ...".
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_multiply);
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
...@@ -26,6 +26,7 @@ enum struct GemmDataType ...@@ -26,6 +26,7 @@ enum struct GemmDataType
F8_F16_F16, // 4 F8_F16_F16, // 4
F16_F8_F16, // 5 F16_F8_F16, // 5
F16_F16_F16_F8, // 6 F16_F16_F16_F8, // 6
F8_F8_BF16, // 7
}; };
#define OP_NAME "gemm_universal" #define OP_NAME "gemm_universal"
...@@ -36,7 +37,8 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -36,7 +37,8 @@ int profile_gemm_universal(int argc, char* argv[])
if(argc != 15 && argc != 18) if(argc != 15 && argc != 18)
{ {
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, " printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, "
"comp f8)\n"); "comp f8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
...@@ -91,15 +93,17 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -91,15 +93,17 @@ int profile_gemm_universal(int argc, char* argv[])
auto profile = [&](auto a_type, auto profile = [&](auto a_type,
auto b_type, auto b_type,
auto comp_type,
auto acc_type, auto acc_type,
auto c_type, auto c_type,
auto a_layout, auto a_layout,
auto b_layout, auto b_layout,
auto c_layout) { auto c_layout) {
using ADataType = decltype(a_type); using ADataType = decltype(a_type);
using BDataType = decltype(b_type); using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type); using ComputeDataType = decltype(comp_type);
using CDataType = decltype(c_type); using AccDataType = decltype(acc_type);
using CDataType = decltype(c_type);
using ALayout = decltype(a_layout); using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); using BLayout = decltype(b_layout);
...@@ -111,6 +115,7 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -111,6 +115,7 @@ int profile_gemm_universal(int argc, char* argv[])
bool pass = ck::profiler::profile_gemm_universal_impl<ADataType, bool pass = ck::profiler::profile_gemm_universal_impl<ADataType,
BDataType, BDataType,
ComputeDataType,
AccDataType, AccDataType,
CDataType, CDataType,
ALayout, ALayout,
...@@ -136,35 +141,43 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -136,35 +141,43 @@ int profile_gemm_universal(int argc, char* argv[])
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
} }
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{}); return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{}); return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
} }
else else
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_universal_reduce_impl.hpp"
#include "profiler_operation_registry.hpp"
// Storage layouts for A * B = C, selected by command-line arg3.
// Naming: <A layout>_<B layout>_<C layout>, e.g. MK_KN_MN means
// A is [M, K] (row-major), B is [K, N] (row-major), C is [M, N].
enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
};

// A/B/C data-type combinations, selected by command-line arg2.
// Note: entry 5 pairs a bf16 A with an int8 B.
enum struct GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
    F8_F16_F16,     // 4
    BF16_I8_BF16,   // 5
    F16_F16_F16_F8, // 6
};
#define OP_NAME "gemm_universal_reduce"
#define OP_DESC "Universal GEMM"

// Profiler entry point for the universal (split-K capable) GEMM with a
// reduction epilogue. Parses the command line, selects the template
// instantiation for the requested data type / layout combination and
// dispatches to ck::profiler::profile_gemm_universal_reduce_impl.
// Returns 0 when the profiled run passes, 1 otherwise (also on unsupported
// configurations); calls exit(1) on malformed command lines.
int profile_gemm_universal_reduce(int argc, char* argv[])
{
    // 14 mandatory arguments (argc == 15), or 17 when the optional
    // warm-up / iteration / rotating-buffer triple is appended (argc == 18).
    if(argc != 15 && argc != 18)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        // Option 5 is bf16 A with int8 B (GemmDataType::BF16_I8_BF16);
        // the previous help text wrongly said "f16@i8".
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: bf16@i8; 6: "
               "f16, "
               "comp f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into multiple batches\n");
        printf("optional:\n");
        printf("arg15: number of warm-up cycles (default 1)\n");
        printf("arg16: number of iterations (default 10)\n");
        printf("arg17: memory for rotating buffer (default 0, size in MB)\n");
        exit(1);
    }

    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method = std::stoi(argv[5]);
    const bool do_log = std::stoi(argv[6]);
    const bool time_kernel = std::stoi(argv[7]);
    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);
    const int StrideA = std::stoi(argv[11]);
    const int StrideB = std::stoi(argv[12]);
    const int StrideC = std::stoi(argv[13]);
    // Split-K factor for the K dimension.
    const int KBatch = std::stoi(argv[14]);

    // Benchmarking controls; overridden by the optional trailing arguments.
    int n_warmup = 1;
    int n_iter = 10;
    uint64_t rotating = 0;
    if(argc == 18)
    {
        n_warmup = std::stoi(argv[15]);
        n_iter = std::stoi(argv[16]);
        // argv[17] is given in MB; convert to bytes.
        rotating = std::stoull(argv[17]) * 1024 * 1024;
    }

    using F32 = float;
    using F16 = ck::half_t;
    using BF16 = ck::bhalf_t;
    using I8 = int8_t;

    using Row = ck::tensor_layout::gemm::RowMajor;

    // No extra D tensors for this operation.
    using DsDataType = ck::Tuple<>;
    using DsLayout = ck::Tuple<>;

    // Dispatcher: the tag arguments carry the data types and layouts into the
    // template parameters of profile_gemm_universal_reduce_impl. A negative
    // stride on the command line selects the packed default stride for that
    // layout.
    auto profile = [&](auto a_type,
                       auto b_type,
                       auto acc_type,
                       auto c_type,
                       auto a_layout,
                       auto b_layout,
                       auto c_layout) {
        using ADataType = decltype(a_type);
        using BDataType = decltype(b_type);
        using AccDataType = decltype(acc_type);
        using CDataType = decltype(c_type);

        using ALayout = decltype(a_layout);
        using BLayout = decltype(b_layout);
        using CLayout = decltype(c_layout);

        // Packed (contiguous) strides for each layout.
        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;

        bool pass = ck::profiler::profile_gemm_universal_reduce_impl<ADataType,
                                                                     BDataType,
                                                                     DsDataType,
                                                                     AccDataType,
                                                                     CDataType,
                                                                     ALayout,
                                                                     BLayout,
                                                                     DsLayout,
                                                                     CLayout>(
            do_verification,
            init_method,
            do_log,
            time_kernel,
            M,
            N,
            K,
            (StrideA < 0) ? DefaultStrideA : StrideA,
            (StrideB < 0) ? DefaultStrideB : StrideB,
            (StrideC < 0) ? DefaultStrideC : StrideC,
            KBatch,
            n_warmup,
            n_iter,
            rotating);

        return pass ? 0 : 1;
    };

    // Only row-major x row-major combinations are wired up.
    if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(BF16{}, I8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;
        return 1;
    }
}

// Makes the operation reachable via "ckProfiler gemm_universal_reduce ...".
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_universal_reduce);
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib> #include <cstdlib>
#include <initializer_list> #include <initializer_list>
...@@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) ...@@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]); ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
split_k = std::max(1, split_k);
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
......
...@@ -29,6 +29,12 @@ enum struct ConvDataType ...@@ -29,6 +29,12 @@ enum struct ConvDataType
BF8_F8_F8, // 7 BF8_F8_F8, // 7
}; };
enum struct IndexType
{
INDEX_T, // 0
LONG_INDEX_T, // 1
};
#define OP_NAME "grouped_conv_fwd" #define OP_NAME "grouped_conv_fwd"
#define OP_DESC "Grouped Convolution Forward" #define OP_DESC "Grouped Convolution Forward"
...@@ -45,12 +51,13 @@ static void print_helper_msg() ...@@ -45,12 +51,13 @@ static void print_helper_msg()
<< " 5: Input bf8, Weight bf8, Output fp8\n" << " 5: Input bf8, Weight bf8, Output fp8\n"
<< " 6: Input fp8, Weight bf8, Output fp8\n" << " 6: Input fp8, Weight bf8, Output fp8\n"
<< " 7: Input bf8, Weight fp8, Output fp8)\n" << " 7: Input bf8, Weight fp8, Output fp8)\n"
<< "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n" << "arg3: indexing data type (0: 32-bit, 1: 64-bit)\n"
<< "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
<< " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n" << " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
<< "arg4: verification (0: no, 1: yes)\n" << "arg5: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n" << "arg7: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n" << "arg8: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format on // clang-format on
} }
...@@ -60,7 +67,7 @@ static void print_helper_msg() ...@@ -60,7 +67,7 @@ static void print_helper_msg()
int profile_grouped_conv_fwd(int argc, char* argv[]) int profile_grouped_conv_fwd(int argc, char* argv[])
{ {
// 8 for control, 1 for num_dim_spatial // 8 for control, 1 for num_dim_spatial
if(argc < 9) if(argc < 10)
{ {
print_helper_msg(); print_helper_msg();
return 1; return 1;
...@@ -68,20 +75,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) ...@@ -68,20 +75,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3])); const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]); const auto index_type = static_cast<IndexType>(std::stoi(argv[4]));
const int init_method = std::stoi(argv[5]); const bool do_verification = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const int init_method = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]); const bool do_log = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]); const bool time_kernel = std::stoi(argv[8]);
const int num_dim_spatial = std::stoi(argv[9]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) if(argc != 9 + 1 + 4 + 6 * num_dim_spatial)
{ {
print_helper_msg(); print_helper_msg();
return 1; return 1;
} }
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
...@@ -138,18 +146,43 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) ...@@ -138,18 +146,43 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
using AComputeType = decltype(a_compute_type); using AComputeType = decltype(a_compute_type);
using BComputeType = decltype(b_compute_type); using BComputeType = decltype(b_compute_type);
bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial, if(index_type == IndexType::INDEX_T)
InLayout, {
WeiLayout, bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
OutLayout, InLayout,
InDataType, WeiLayout,
WeiDataType, OutLayout,
OutDataType, InDataType,
AComputeType, WeiDataType,
BComputeType>( OutDataType,
do_verification, init_method, do_log, time_kernel, params); AComputeType,
BComputeType,
ck::index_t>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
}
else if(index_type == IndexType::LONG_INDEX_T)
{
bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
AComputeType,
BComputeType,
ck::long_index_t>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1; return pass ? 0 : 1;
}
else
{
std::cout << "this indexing data type is not implemented" << std::endl;
return 1;
}
}; };
// GNHWC_GKYXC_GNHWK // GNHWC_GKYXC_GNHWK
......
...@@ -85,9 +85,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -85,9 +85,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
const auto StrideCs = argToIntArray(argv[13]); const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1; const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using F8 = ck::f8_t; #if defined(CK_ENABLE_FP8)
using F8 = ck::f8_t;
#endif
using BF16 = ck::bhalf_t; using BF16 = ck::bhalf_t;
using I8 = int8_t; using I8 = int8_t;
......
...@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]: ...@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]:
i_current = i_next + 1 i_current = i_next + 1
if i_next == -1: if i_next == -1:
break break
# pad with `None`s for the fields which are not defined in the instance
template_args.insert(2, tuple()) # ds layout
template_args.insert(6, tuple()) # ds dtype
new_instance = CKGemmOperation( new_instance = CKGemmOperation(
*template_args, # type: ignore[arg-type] *template_args, # type: ignore[arg-type]
*((None,) * (len(fields(CKGemmOperation)) - len(template_args))),
) )
# the last 2 template parameters are optional
# if they are absent, substitute them with default values from Universal Gemm C++ template declaration
if new_instance.a_compute_dtype is None:
new_instance.a_compute_dtype = new_instance.c_element_dtype
if new_instance.b_compute_dtype is None:
new_instance.b_compute_dtype = new_instance.c_element_dtype
op_instances.append(new_instance) op_instances.append(new_instance)
return op_instances return op_instances
...@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]: ...@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]:
a_layout="Row", a_layout="Row",
b_layout="Col", b_layout="Col",
c_layout="Row", c_layout="Row",
ds_element_dtypes=tuple(),
ds_layouts=tuple(),
a_element_dtype="F16", a_element_dtype="F16",
b_element_dtype="F16", b_element_dtype="F16",
c_element_dtype="F16", c_element_dtype="F16",
......
...@@ -10,10 +10,12 @@ class CKGemmOperation: ...@@ -10,10 +10,12 @@ class CKGemmOperation:
a_layout: str a_layout: str
b_layout: str b_layout: str
ds_layouts: Tuple[str] # addmm specific
c_layout: str c_layout: str
a_element_dtype: str a_element_dtype: str
b_element_dtype: str b_element_dtype: str
ds_element_dtypes: Tuple[str] # addmm specific
c_element_dtype: str c_element_dtype: str
acc_dtype: str acc_dtype: str
...@@ -64,16 +66,15 @@ class CKGemmOperation: ...@@ -64,16 +66,15 @@ class CKGemmOperation:
Tuple[int, int, int, int] Tuple[int, int, int, int]
) )
c_shuffle_block_transfer_scalar_per_vector_n_per_block: int c_shuffle_block_transfer_scalar_per_vector_n_per_block: int
block_gemm_pipeline_scheduler: str block_gemm_pipeline_scheduler: str
block_gemm_pipeline_version: Optional[str] block_gemm_pipeline_version: str
a_compute_dtype: Optional[str] a_compute_dtype: Optional[str] = None
b_compute_dtype: Optional[str] b_compute_dtype: Optional[str] = None
def name(self): def name(self):
# cpp alias for template instance # cpp alias for template instance
return f"ck_devicegemm_xdl_shuffle_v3_{self.key_name()}" return f"ck_devicegemm_multid_xdl_shuffle_v3_{self.key_name()}"
def key_name(self): def key_name(self):
# TBD; must be unique per instance. Intended to use as dict key # TBD; must be unique per instance. Intended to use as dict key
......
File mode changed from 100644 to 100755
# SPDX-License-Identifier: MIT
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
# Convert miopen driver command to ck Profiler
# Example: python3 ../script/convert_miopen_driver_to_profiler.py
# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
import argparse
import subprocess
def init_const_args(args):
    """Attach the constant settings shared by every profiler invocation."""
    fixed_settings = {
        'ck_profiler_cmd': '../build/bin/ckProfiler',
        'init_method': 2,  # use decimal values
        'log_value': 0,    # don't print tensor values
    }
    for attr_name, attr_value in fixed_settings.items():
        setattr(args, attr_name, attr_value)
def run_ck_profiler_cmd(cmd):
    """Echo the assembled ckProfiler argv list, then execute it."""
    for output_line in ("ckProfiler command:", cmd):
        print(output_line)
    subprocess.run(cmd)
def parse_data_type(args):
    """Map the textual data type in ``args.data_type`` to the numeric id
    that ckProfiler expects for the operation in ``args.ck_profier_op``.

    (The attribute name ``ck_profier_op`` keeps the historical typo that is
    used consistently throughout this script.)

    Mapping: fp32 -> 0, fp16 -> 1, bfp16 -> 2, int8 -> 3 (fwd) / 4 (wrw).
    int8 backward-data is unsupported and aborts the script.
    """
    if args.data_type == "fp32":
        # Fix: the original listed grouped_conv_bwd_weight twice and never
        # matched grouped_conv_bwd_data, so fp32 backward-data runs left
        # data_type as the string "fp32" instead of the numeric id 0.
        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
           args.ck_profier_op == "grouped_conv_bwd_data" or \
           args.ck_profier_op == "grouped_conv_fwd":
            args.data_type = 0
    if args.data_type == "fp16":
        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
           args.ck_profier_op == "grouped_conv_bwd_data" or \
           args.ck_profier_op == "grouped_conv_fwd":
            args.data_type = 1
    if args.data_type == "int8":
        if args.ck_profier_op == "grouped_conv_bwd_weight":
            args.data_type = 4
        if args.ck_profier_op == "grouped_conv_bwd_data":
            print('Not supported data type for grouped_conv_bwd_data')
            exit(1)
        if args.ck_profier_op == "grouped_conv_fwd":
            args.data_type = 3
    if args.data_type == "bfp16":
        if args.ck_profier_op == "grouped_conv_bwd_weight" or \
           args.ck_profier_op == "grouped_conv_bwd_data" or \
           args.ck_profier_op == "grouped_conv_fwd":
            args.data_type = 2
def add_conv_params_to_cmd(args, cmd):
    """Append the convolution geometry to ``cmd`` in place.

    Emission order per spatial dimensionality: filter sizes, input sizes,
    strides, dilations, then left and right pads (pads are emitted twice).
    Unsupported dimensionalities abort the script.
    """
    if args.spatial_dim == 1:
        geometry = [args.fil_w, args.in_w,
                    args.conv_stride_w, args.dilation_w,
                    args.pad_w, args.pad_w]
    elif args.spatial_dim == 2:
        geometry = [args.fil_h, args.fil_w,
                    args.in_h, args.in_w,
                    args.conv_stride_h, args.conv_stride_w,
                    args.dilation_h, args.dilation_w,
                    args.pad_h, args.pad_w,
                    args.pad_h, args.pad_w]
    elif args.spatial_dim == 3:
        geometry = [args.fil_d, args.fil_h, args.fil_w,
                    args.in_d, args.in_h, args.in_w,
                    args.conv_stride_d, args.conv_stride_h,
                    args.conv_stride_w,
                    args.dilation_d, args.dilation_h, args.dilation_w,
                    args.pad_d, args.pad_h, args.pad_w,
                    args.pad_d, args.pad_h, args.pad_w]
    else:
        print('Not supported spatial dim (supported: 1, 2, 3)')
        exit(1)
    cmd += [str(value) for value in geometry]
def run_ck_grouped_conv_fwd(args):
    """Profile a grouped forward convolution through ckProfiler."""
    args.ck_profier_op = "grouped_conv_fwd"
    parse_data_type(args)
    # default for MIOpen NHWGC
    args.layout = 1
    # use int32 by default
    args.index_type = 0
    header_fields = [args.ck_profiler_cmd, args.ck_profier_op,
                     args.data_type, args.layout, args.index_type,
                     args.verify, args.init_method,
                     args.log_value, args.time,
                     args.spatial_dim, args.group_count,
                     args.batchsize, args.out_channels,
                     args.in_channels]
    cmd = [str(field) for field in header_fields]
    add_conv_params_to_cmd(args, cmd)
    run_ck_profiler_cmd(cmd)
def run_ck_grouped_conv_bwd_data(args):
    """Profile a grouped backward-data convolution through ckProfiler."""
    args.ck_profier_op = "grouped_conv_bwd_data"
    parse_data_type(args)
    # default for MIOpen NHWGC
    args.layout = 1
    header_fields = [args.ck_profiler_cmd, args.ck_profier_op,
                     args.data_type, args.layout,
                     args.verify, args.init_method,
                     args.log_value, args.time,
                     args.spatial_dim, args.group_count,
                     args.batchsize, args.out_channels,
                     args.in_channels]
    cmd = [str(field) for field in header_fields]
    add_conv_params_to_cmd(args, cmd)
    run_ck_profiler_cmd(cmd)
def run_ck_grouped_conv_bwd_weight(args):
    """Profile a grouped backward-weight convolution through ckProfiler."""
    args.ck_profier_op = "grouped_conv_bwd_weight"
    parse_data_type(args)
    # default for MIOpen NHWGC
    args.layout = 2
    # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
    args.split_k_value = -1
    header_fields = [args.ck_profiler_cmd, args.ck_profier_op,
                     args.data_type, args.layout,
                     args.verify, args.init_method,
                     args.log_value, args.time,
                     args.spatial_dim, args.group_count,
                     args.batchsize, args.out_channels,
                     args.in_channels]
    cmd = [str(field) for field in header_fields]
    add_conv_params_to_cmd(args, cmd)
    # split-K comes after the convolution geometry parameters
    cmd += [str(args.split_k_value)]
    run_ck_profiler_cmd(cmd)
# Get name of miopen driver, remove it from unknown
def process_miopen_driver_name(args, unknown):
    """Infer the tensor data type from the MIOpen driver token in ``unknown``.

    The recognized token is removed from the list; an unrecognized driver
    aborts the script.  Specific names are checked before the plain "conv"
    so "convint8"/"convbfp16"/"convfp16" are not shadowed.
    """
    driver_dtype_order = (
        ("convint8", 'int8'),
        ("convbfp16", 'bfp16'),
        ("convfp16", 'fp16'),
        ("conv", 'fp32'),
    )
    for driver_name, dtype in driver_dtype_order:
        if driver_name in unknown:
            args.data_type = dtype
            unknown.remove(driver_name)
            return
    print('Not supported driver (supported: conv, convfp16, convint8,'
          ' convbfp16).')
    exit(1)
def run_ck_profiler(args):
    """Dispatch to fwd / bwd-data / bwd-weight profiling per the -F flag."""
    # MIOpen gets number of channels per all groups; CK profiler gets
    # number of channels per group, so divide both counts by group_count.
    args.in_channels = int(args.in_channels / args.group_count)
    args.out_channels = int(args.out_channels / args.group_count)
    if args.forw in (0, 1, 3, 5):
        run_ck_grouped_conv_fwd(args)
    if args.forw in (0, 2, 3, 6):
        run_ck_grouped_conv_bwd_data(args)
    if args.forw in (0, 4, 5, 6):
        run_ck_grouped_conv_bwd_weight(args)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="converter",
        description="Convert miopen driver command to ck Profiler"
        "\nExample: python3 "
        "../script/convert_miopen_driver_to_profiler.py "
        "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
        "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
        "32 -F 1 -t 1",
    )
    # Option names (long and short forms) mirror MIOpenDriver's flags so an
    # existing driver command line can be passed through unchanged; unknown
    # tokens (including the driver name itself) are handled separately via
    # parse_known_args below.
    parser.add_argument(
        "-in_layout",
        "-I",
        default=-1,
        type=int,
        required=False,
        help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"
    )
    parser.add_argument(
        "-forw",
        "-F",
        default=0,
        type=int,
        required=False,
        help="Flag enables fwd, bwd, wrw convolutions"
        "\n0 fwd+bwd+wrw (default)"
        "\n1 fwd only"
        "\n2 bwd only"
        "\n4 wrw only"
        "\n3 fwd+bwd"
        "\n5 fwd+wrw"
        "\n6 bwd+wrw"
    )
    parser.add_argument(
        "-spatial_dim",
        "-_",
        default=2,
        type=int,
        required=False,
        help="convolution spatial dimension (Default=2)"
    )
    parser.add_argument(
        "-batchsize",
        "-n",
        default=100,
        type=int,
        required=False,
        help="Mini-batch size (Default=100)"
    )
    parser.add_argument(
        "-in_channels",
        "-c",
        default=3,
        type=int,
        required=False,
        help="Number of Input Channels (Default=3)"
    )
    parser.add_argument(
        "-in_d",
        "-!",
        default=32,
        type=int,
        required=False,
        help="Input Depth (Default=32)"
    )
    parser.add_argument(
        "-in_h",
        "-H",
        default=32,
        type=int,
        required=False,
        help="Input Height (Default=32)"
    )
    parser.add_argument(
        "-in_w",
        "-W",
        default=32,
        type=int,
        required=False,
        help="Input Width (Default=32)"
    )
    parser.add_argument(
        "-out_channels",
        "-k",
        default=32,
        type=int,
        required=False,
        help="Number of Output Channels (Default=32)"
    )
    parser.add_argument(
        "-fil_d",
        "-@",
        default=3,
        type=int,
        required=False,
        help="Filter Depth (Default=3)"
    )
    parser.add_argument(
        "-fil_h",
        "-y",
        default=3,
        type=int,
        required=False,
        help="Filter Height (Default=3)"
    )
    parser.add_argument(
        "-fil_w",
        "-x",
        default=3,
        type=int,
        required=False,
        help="Filter Width (Default=3)"
    )
    parser.add_argument(
        "-conv_stride_d",
        "-#",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Depth (Default=1)"
    )
    parser.add_argument(
        "-conv_stride_h",
        "-u",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Height (Default=1)"
    )
    parser.add_argument(
        "-conv_stride_w",
        "-v",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Width (Default=1)"
    )
    # NOTE(review): help text previously claimed "(Default=0)" for the pad
    # options while the actual argparse default is 1; the text now reflects
    # the code. MIOpenDriver's own pad default is 0 — confirm that 1 is the
    # intended default for this converter.
    parser.add_argument(
        "-pad_d",
        "-$",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Depth (Default=1)"
    )
    parser.add_argument(
        "-pad_h",
        "-p",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Height (Default=1)"
    )
    parser.add_argument(
        "-pad_w",
        "-q",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Width (Default=1)"
    )
    parser.add_argument(
        "-verify",
        "-V",
        default=1,
        type=int,
        required=False,
        help="Verify Each Layer (Default=1)"
    )
    parser.add_argument(
        "-time",
        "-t",
        default=0,
        type=int,
        required=False,
        help="Time Each Layer (Default=0)"
    )
    parser.add_argument(
        "-dilation_d",
        "-^",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Depth (Default=1)"
    )
    parser.add_argument(
        "-dilation_h",
        "-l",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Height (Default=1)"
    )
    parser.add_argument(
        "-dilation_w",
        "-j",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Width (Default=1)"
    )
    parser.add_argument(
        "-group_count",
        "-g",
        type=int,
        default=1,
        required=False,
        help="Number of Groups (Default=1)"
    )
    # Keep unrecognized tokens: the MIOpen driver name is extracted from
    # them; whatever remains is reported and ignored.
    args, unknown = parser.parse_known_args()
    init_const_args(args)
    process_miopen_driver_name(args, unknown)
    print("Ignored args:")
    print(unknown)
    run_ck_profiler(args)
...@@ -122,7 +122,7 @@ def parse_logfile(logfile): ...@@ -122,7 +122,7 @@ def parse_logfile(logfile):
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1)) test_list=list(range(1,len(tests)+1))
#parse conv_fwd and conv_bwd performance tests: #parse conv_fwd and conv_bwd performance tests:
elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile: elif 'conv_fwd' in logfile or 'conv_bwd' in logfile:
for line in open(logfile): for line in open(logfile):
if 'tflops:' in line: if 'tflops:' in line:
lst=line.split() lst=line.split()
...@@ -143,6 +143,12 @@ def parse_logfile(logfile): ...@@ -143,6 +143,12 @@ def parse_logfile(logfile):
if 'Best Perf' in line: if 'Best Perf' in line:
lst=line.split() lst=line.split()
res.append(lst[36]) res.append(lst[36])
elif 'perf_fmha' in logfile:
for line in open(logfile):
if 'TFlops' in line:
lst=line.split()
line_dict=dict(zip(lst[1:],lst))
res.append(line_dict['TFlops,'])
return res return res
...@@ -268,14 +274,26 @@ def main(): ...@@ -268,14 +274,26 @@ def main():
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_grouped_gemm_tflops" table_name="ck_grouped_gemm_tflops"
if 'conv_fwd' in filename: if 'perf_conv_fwd' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_conv_fwd_tflops" table_name="ck_conv_fwd_tflops"
if 'conv_bwd_data' in filename: if 'perf_conv_bwd_data' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_conv_bwd_data_tflops" table_name="ck_conv_bwd_data_tflops"
if 'grouped_conv_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_fwd_tflops"
if 'grouped_conv_bwd_data' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_data_tflops"
if 'grouped_conv_bwd_weight' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_weight_tflops"
if 'gemm_bilinear' in filename: if 'gemm_bilinear' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
...@@ -304,6 +322,14 @@ def main(): ...@@ -304,6 +322,14 @@ def main():
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_mixed_gemm_tflops" table_name="ck_mixed_gemm_tflops"
if 'fmha_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_fmha_fwd_tflops"
if 'fmha_bwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_fmha_bwd_tflops"
tflops_base = get_baseline(table_name,conn) tflops_base = get_baseline(table_name,conn)
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn) store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
......
...@@ -13,3 +13,20 @@ ...@@ -13,3 +13,20 @@
python3 process_perf_data.py perf_gemm.log python3 process_perf_data.py perf_gemm.log
python3 process_perf_data.py perf_resnet50_N256.log python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log python3 process_perf_data.py perf_resnet50_N4.log
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
fi
file=./perf_fmha_bwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx942.log
fi
file=./perf_fmha_fwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
fi
file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
fi
...@@ -15,9 +15,27 @@ python3 process_perf_data.py perf_resnet50_N256.log ...@@ -15,9 +15,27 @@ python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log python3 process_perf_data.py perf_resnet50_N4.log
python3 process_perf_data.py perf_batched_gemm.log python3 process_perf_data.py perf_batched_gemm.log
python3 process_perf_data.py perf_grouped_gemm.log python3 process_perf_data.py perf_grouped_gemm.log
python3 process_perf_data.py perf_conv_fwd.log python3 process_perf_data.py perf_grouped_conv_fwd.log
python3 process_perf_data.py perf_conv_bwd_data.log python3 process_perf_data.py perf_grouped_conv_bwd_data.log
python3 process_perf_data.py perf_grouped_conv_bwd_weight.log
python3 process_perf_data.py perf_gemm_bilinear.log python3 process_perf_data.py perf_gemm_bilinear.log
python3 process_perf_data.py perf_reduction.log python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_splitK_gemm.log python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_onnx_gemm.log
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
fi
file=./perf_fmha_bwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx942.log
fi
file=./perf_fmha_fwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
fi
file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
fi
...@@ -12,27 +12,28 @@ INIT=$5 ...@@ -12,27 +12,28 @@ INIT=$5
LOG=$6 LOG=$6
TIME=$7 TIME=$7
N=$8 N=$8
SplitK=$9
# Resnet50 # Resnet50
######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads ######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $SplitK
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0

DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
INDEXTYPE=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8
N=$9

# Run one convolution shape through ckProfiler with the common flag prefix:
# op, datatype, indextype, layout, verify, init, log, time, then the
# per-shape geometry arguments supplied by the caller.
run_case() {
    $DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

# Resnet50
######## conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
run_case 2 1 $N  256 1024 1 1  14  14 1 1 1 1 0 0 0 0
run_case 2 1 $N  512 1024 1 1  14  14 1 1 1 1 0 0 0 0
run_case 2 1 $N  128  128 3 3  28  28 1 1 1 1 1 1 1 1
run_case 2 1 $N  512  128 1 1  28  28 1 1 1 1 0 0 0 0
run_case 2 1 $N  128  128 3 3  56  56 2 2 1 1 1 1 1 1
run_case 2 1 $N  512 2048 1 1   7   7 1 1 1 1 0 0 0 0
run_case 2 1 $N 1024  256 1 1  14  14 1 1 1 1 0 0 0 0
run_case 2 1 $N  256  256 3 3  14  14 1 1 1 1 1 1 1 1
run_case 2 1 $N  256  256 3 3  28  28 2 2 1 1 1 1 1 1
run_case 2 1 $N  128  256 1 1  56  56 1 1 1 1 0 0 0 0
run_case 2 1 $N   64  256 1 1  56  56 1 1 1 1 0 0 0 0
run_case 2 1 $N  512  512 3 3  14  14 2 2 1 1 1 1 1 1
run_case 2 1 $N  128  512 1 1  28  28 1 1 1 1 0 0 0 0
run_case 2 1 $N  256  512 1 1  28  28 1 1 1 1 0 0 0 0
run_case 2 1 $N 2048  512 1 1   7   7 1 1 1 1 0 0 0 0
run_case 2 1 $N  512  512 3 3   7   7 1 1 1 1 1 1 1 1
run_case 2 1 $N  256   64 1 1  56  56 1 1 1 1 0 0 0 0
run_case 2 1 $N   64   64 1 1  56  56 1 1 1 1 0 0 0 0
run_case 2 1 $N   64   64 3 3  56  56 1 1 1 1 1 1 1 1
run_case 2 1 $N   64    3 7 7 224 224 2 2 1 1 3 3 3 3
...@@ -90,21 +90,27 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name ...@@ -90,21 +90,27 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log ./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log ./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
#run conv_fwd tests #run grouped_fwd tests
export conv_fwd_log="perf_conv_fwd.log" export grouped_conv_fwd_log="perf_grouped_conv_fwd.log"
print_log_header $conv_fwd_log $env_type $branch $host_name print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log ./profile_grouped_conv_fwd.sh grouped_conv_fwd 0 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log ./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_conv_fwd.sh conv_fwd 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log ./profile_grouped_conv_fwd.sh grouped_conv_fwd 2 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
#run conv_bwd_data tests #run grouped_bwd_data tests
export conv_bwd_data_log="perf_conv_bwd_data.log" export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data.log"
print_log_header $conv_bwd_data_log $env_type $branch $host_name print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
#run grouped_bwd_weight tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 0 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 2 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4 2>&1 | tee -a $grouped_conv_bwd_weight_log
#run resnet50 tests #run resnet50 tests
export resnet256_log="perf_resnet50_N256.log" export resnet256_log="perf_resnet50_N256.log"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment