"fmoe/vscode:/vscode.git/clone" did not exist on "91a5e794ff1b6754960cfe825962a6c0979775d8"
Commit e57bd240 authored by aska-0096's avatar aska-0096
Browse files

Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
parents 4b70d68e 37a8c1f7
......@@ -71,7 +71,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
const int BatchCount = std::stoi(argv[17]);
using F16 = ck::half_t;
#ifdef __int8__
#ifdef CK_ENABLE_INT8
using INT8 = int8_t;
#endif
......@@ -165,7 +165,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
{
return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
}
#ifdef __int8__
#ifdef CK_ENABLE_INT8
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
......
......@@ -77,7 +77,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
#ifdef __int8__
#ifdef CK_ENABLE_INT8
using INT8 = int8_t;
#endif
......@@ -140,7 +140,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
{
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
}
#ifdef __int8__
#ifdef CK_ENABLE_INT8
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
......@@ -161,7 +161,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
}
#ifdef __int8__
#ifdef CK_ENABLE_INT8
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
......@@ -182,7 +182,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
}
#ifdef __int8__
#ifdef CK_ENABLE_INT8
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
......
......@@ -69,10 +69,10 @@ int profile_gemm(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
#ifdef __bf16__
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef __int8__
#ifdef CK_ENABLE_INT8
using INT8 = int8_t;
using INT32 = int32_t;
#endif
......@@ -121,7 +121,10 @@ int profile_gemm(int argc, char* argv[])
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
if(false)
;
#ifdef CK_ENABLE_FP32
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
}
......@@ -137,6 +140,8 @@ int profile_gemm(int argc, char* argv[])
{
return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
}
#endif
#ifdef CK_ENABLE_FP16
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
......@@ -153,7 +158,8 @@ int profile_gemm(int argc, char* argv[])
{
return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
}
#ifdef __bf16__
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
......@@ -171,7 +177,7 @@ int profile_gemm(int argc, char* argv[])
return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
}
#endif
#ifdef __int8__
#ifdef CK_ENABLE_INT8
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_multiply_add_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_multiply_add"
#define OP_DESC "GEMM+MULTIPLY+ADD"
int profile_gemm_multiply_add(int argc, char* argv[])
{
enum struct MatrixLayout
{
MK_KN_MN_MN_MN, // 0
MK_NK_MN_MN_MN, // 1
};
enum struct MatrixDataType
{
F16_F16_F16_F16_F16, // 0
F16_F8_F32_F32_F16, // 1
};
if(argc != 16)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp16; 1: fp16Afp8B)\n");
printf("arg3: matrix layout (0: E[m, n] = Multiply_Add((A[m, k] * B[k, n]) x D1[m, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = Multiply_Add((A[m, k] * B[n, k]) x D1[m, n] + D0[m, n]);\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<MatrixDataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideD0 = std::stoi(argv[13]);
const int StrideD1 = std::stoi(argv[14]);
const int StrideE = std::stoi(argv[15]);
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto d1_type,
auto e_type,
auto a_layout,
auto b_layout,
auto d0_layout,
auto d1_layout,
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using D1DataType = decltype(d1_type);
using EDataType = decltype(e_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using D0Layout = decltype(d0_layout);
using D1Layout = decltype(d1_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_multiply_add_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
D1DataType,
EDataType,
ALayout,
BLayout,
D0Layout,
D1Layout,
ELayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
(StrideE < 0) ? DefaultStrideE : StrideE);
return pass ? 0 : 1;
};
if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F8_F32_F32_F16 &&
layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F8_F32_F32_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_add);
......@@ -23,6 +23,8 @@ enum struct GemmDataType
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F16_F16, // 4
F16_F8_F16, // 5
};
#define OP_NAME "gemm_splitk"
......@@ -33,7 +35,7 @@ int profile_gemm_splitk(int argc, char* argv[])
if(argc != 15)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
......@@ -65,6 +67,7 @@ int profile_gemm_splitk(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
using F8 = ck::f8_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
......@@ -143,6 +146,38 @@ int profile_gemm_splitk(int argc, char* argv[])
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_streamk_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
#define OP_NAME "gemm_streamk"
#define OP_DESC "StreamK GEMM"
int profile_gemm_streamk(int argc, char* argv[])
{
if(argc < 14)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: num_sk_blocks (optional)\n");
exit(1);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const uint32_t NumSKBlocks =
argc >= 15 ? static_cast<uint32_t>(std::stoul(std::string(argv[14]))) : 0xffffffff;
using F32 = float;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto c_type,
auto a_layout,
auto b_layout,
auto c_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using CDataType = decltype(c_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using CLayout = decltype(c_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_streamk_impl<ADataType,
BDataType,
AccDataType,
CDataType,
ALayout,
BLayout,
CLayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA <= 0) ? DefaultStrideA : StrideA,
(StrideB <= 0) ? DefaultStrideB : StrideB,
(StrideC <= 0) ? DefaultStrideC : StrideC,
NumSKBlocks);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_streamk);
......@@ -77,15 +77,10 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NHWGK = ck::tensor_layout::convolution::NHWGK;
using namespace ck::tensor_layout::convolution;
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto out_layout,
......@@ -116,36 +111,70 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
return pass ? 0 : 1;
};
// GNHWC_GKYXC_GNHWK
if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
if(num_dim_spatial == 2)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
}
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
}
}
}
// NHWGC_GKYXC_NHWGK
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
else if(num_dim_spatial == 3)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, BF16{}, BF16{}, BF16{});
}
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, BF16{}, BF16{}, BF16{});
}
}
}
......
......@@ -83,19 +83,7 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using GNWC = ck::tensor_layout::convolution::GNWC;
using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
using GKXC = ck::tensor_layout::convolution::GKXC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using GNWK = ck::tensor_layout::convolution::GNWK;
using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NHWGK = ck::tensor_layout::convolution::NHWGK;
using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
using namespace ck::tensor_layout::convolution;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
......@@ -194,6 +182,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
......
......@@ -88,7 +88,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const auto StrideBs = argToIntArray(argv[12]);
const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
#ifdef CK_ENABLE_FP16
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
......@@ -173,7 +173,7 @@ int profile_grouped_gemm(int argc, char* argv[])
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
#endif
return 0;
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_image_to_column_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
enum struct ConvLayout
{
NHWC, // 0
};
enum struct DataType
{
F32_F32, // 0
F16_F16, // 1
BF16_BF16, // 2
INT8_INT8, // 3
};
#define OP_NAME "image_to_column"
#define OP_DESC "Image To Column"
static void print_helper_msg()
{
std::cout
// clang-format off
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format on
}
} // namespace
int profile_image_to_column(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;
using namespace ck::tensor_layout::convolution;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, auto in_type, auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::
profile_image_to_column_impl<NDimSpatial, InLayout, InDataType, OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
// NHWC
if(layout == ConvLayout::NHWC)
{
if(num_dim_spatial == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, GNWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, GNWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I1, GNWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I1, GNWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, GNHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, GNHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I2, GNHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I2, GNHWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, GNDHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, GNDHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I3, GNDHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I3, GNDHWC{}, INT8{}, INT8{});
}
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column);
......@@ -6,15 +6,20 @@
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_pool2d_fwd_impl.hpp"
#include "profiler/profile_max_pool3d_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
struct avgPoolFwdArgParser
struct maxPoolbwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
......@@ -45,24 +50,25 @@ struct avgPoolFwdArgParser
}
};
void print_help_avg_pool2d_fwd()
void print_help_max_pool3d_bwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: input tensor length for NDHW(e.g, --length 2 32 30 30) \n"
<< "--wsize: window size for YX (e.g, --wsize 2 2) \n"
<< "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
<< "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
<< "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
<< "eg: ckProfiler avg_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2 "
"--wstride 2 2 --pad1 1 1 --pad2 1 1"
<< "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
<< "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
int profile_avg_pool2d_fwd(int argc, char* argv[])
int profile_max_pool3d_bwd(int argc, char* argv[])
{
ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
bool do_verification = true;
......@@ -70,18 +76,19 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
bool do_log = false;
bool time_kernel = true;
std::vector<index_t> in_length = {2, 32, 30, 30};
std::vector<index_t> wsize = {2, 2};
std::vector<index_t> wstride = {2, 2};
std::vector<index_t> pad1 = {1, 1};
std::vector<index_t> pad2 = {1, 1};
std::vector<index_t> in_length = {2, 32, 30, 30, 30};
std::vector<index_t> wsize = {2, 2, 2};
std::vector<index_t> wstride = {2, 2, 2};
std::vector<index_t> wdilation = {1, 1, 1};
std::vector<index_t> pad1 = {1, 1, 1};
std::vector<index_t> pad2 = {1, 1, 1};
if(argc != 2 && argc != 25)
if(argc != 2 && argc != 33)
{
print_help_avg_pool2d_fwd();
print_help_max_pool3d_bwd();
return 0;
}
else if(argc == 25)
else if(argc == 33)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
......@@ -90,23 +97,48 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
time_kernel = std::stoi(argv[6]);
// parse the long options
avgPoolFwdArgParser arg_parser;
maxPoolbwdArgParser arg_parser;
arg_parser(argc, argv);
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
using F16 = ck::half_t;
using F32 = float;
using I32 = int32_t;
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#ifdef CK_ENABLE_FP16
using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
using F32 = float;
#endif
using I32 = int32_t;
if(data_type == ck::DataTypeEnum::Half)
if(false)
;
#ifdef CK_ENABLE_FP16
else if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_pool2d_fwd_impl<F16, F16, F32, I32, ReduceOpId, false, false>(
ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
do_verification,
init_method,
do_log,
......@@ -114,22 +146,26 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_pool2d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
pad1,
pad2);
ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
......@@ -138,4 +174,4 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
return 0;
}
REGISTER_PROFILER_OPERATION("avg_pool2d_fwd", "avg_pool2d fwd", profile_avg_pool2d_fwd);
REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
......@@ -13,8 +13,12 @@ using ck::index_t;
struct maxPoolFwdArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
{"wsize", {}},
{"wstride", {}},
{"wdilation", {}},
{"pad1", {}},
{"pad2", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
......@@ -47,7 +51,7 @@ struct maxPoolFwdArgParser
void print_help_max_pool3d_fwd()
{
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
......@@ -56,10 +60,11 @@ void print_help_max_pool3d_fwd()
<< "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
<< "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
<< "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
<< "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
<< "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
<< "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
<< "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
"--wstride 2 2 2 --pad1 1 1 1 --pad2 1 1 1"
"--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
<< std::endl;
}
......@@ -75,15 +80,16 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
std::vector<index_t> in_length = {2, 32, 30, 30, 30};
std::vector<index_t> wsize = {2, 2, 2};
std::vector<index_t> wstride = {2, 2, 2};
std::vector<index_t> wdilation = {1, 1, 1};
std::vector<index_t> pad1 = {1, 1, 1};
std::vector<index_t> pad2 = {1, 1, 1};
if(argc != 2 && argc != 30)
if(argc != 2 && argc != 34)
{
print_help_max_pool3d_fwd();
return 0;
}
else if(argc == 30)
else if(argc == 34)
{
data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
do_verification = std::stoi(argv[3]);
......@@ -98,65 +104,136 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
in_length = arg_parser.long_opts["length"];
wsize = arg_parser.long_opts["wsize"];
wstride = arg_parser.long_opts["wstride"];
wdilation = arg_parser.long_opts["wdilation"];
pad1 = arg_parser.long_opts["pad1"];
pad2 = arg_parser.long_opts["pad2"];
}
using F16 = ck::half_t;
using F32 = float;
using I32 = int32_t;
#ifdef CK_ENABLE_FP16
using F16 = ck::half_t;
#endif
#ifdef CK_ENABLE_BF16
using BF16 = ck::bhalf_t;
#endif
#ifdef CK_ENABLE_FP32
using F32 = float;
#endif
using I32 = int32_t;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
#if 1
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
if(data_type == ck::DataTypeEnum::Half)
if(false)
;
#ifdef CK_ENABLE_FP16
else if(data_type == ck::DataTypeEnum::Half)
{
if(return_index)
ck::profiler::
profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::
profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_BF16
else if(data_type == ck::DataTypeEnum::BFloat16)
{
if(return_index)
ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
pad1,
pad2);
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
true>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
pad1,
pad2);
ck::profiler::profile_pool3d_fwd_impl<BF16,
BF16,
BF16,
I32,
NDHWC,
NDHWC,
ReduceOpId,
false,
false>(do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
#ifdef CK_ENABLE_FP32
else if(data_type == ck::DataTypeEnum::Float)
{
if(return_index)
ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
pad1,
pad2);
ck::profiler::
profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
else
ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
pad1,
pad2);
ck::profiler::
profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
do_verification,
init_method,
do_log,
time_kernel,
in_length,
wsize,
wstride,
wdilation,
pad1,
pad2);
}
#endif
else
{
throw std::runtime_error("not implemented yet");
......
#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
......@@ -16,4 +16,3 @@ cmake
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE}
......@@ -11,7 +11,7 @@ run_and_check() {
}
echo "I: Installing tools required for pre-commit checks..."
run_and_check apt install clang-format-10
run_and_check apt install clang-format-12
echo "I: Installing pre-commit itself..."
run_and_check pip3 install pre-commit
......
......@@ -3,13 +3,6 @@
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
OP=$1
DATATYPE=$2
......
......@@ -57,9 +57,10 @@ add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
add_subdirectory(batchnorm)
add_subdirectory(contraction)
add_subdirectory(pool_fwd)
add_subdirectory(pool)
add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
if(GPU_TARGETS MATCHES "gfx1100")
add_subdirectory(image_to_column)
if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
......@@ -2,21 +2,26 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
endif()
set(target 1)
endif()
endforeach()
\ No newline at end of file
......@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
set(target 1)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
set(target 1)
endif()
endif()
endforeach()
\ No newline at end of file
# TODO: Enable for gfx90a after complier fix
if(NOT GPU_TARGETS MATCHES "gfx90a")
if(DL_KERNELS)
add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment