Commit e57bd240 authored by aska-0096


Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
parents 4b70d68e 37a8c1f7
@@ -71,7 +71,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
    const int BatchCount = std::stoi(argv[17]);
    using F16 = ck::half_t;
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    using INT8 = int8_t;
#endif
@@ -165,7 +165,7 @@ int profile_batched_gemm_multi_d(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
    }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
...
@@ -77,7 +77,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
    using F32  = float;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    using INT8 = int8_t;
#endif
@@ -140,7 +140,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
    {
        return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
    }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    else if(data_type == ConvDataType::INT8_INT8_INT8)
    {
        return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
@@ -161,7 +161,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
    {
        return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
    }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    else if(data_type == ConvDataType::INT8_INT8_INT8)
    {
        return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
@@ -182,7 +182,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
    {
        return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
    }
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    else if(data_type == ConvDataType::INT8_INT8_INT8)
    {
        return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
...
@@ -69,10 +69,10 @@ int profile_gemm(int argc, char* argv[])
    using F32 = float;
    using F16 = ck::half_t;
-#ifdef __bf16__
+#ifdef CK_ENABLE_BF16
    using BF16 = ck::bhalf_t;
#endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    using INT8  = int8_t;
    using INT32 = int32_t;
#endif
@@ -121,7 +121,10 @@ int profile_gemm(int argc, char* argv[])
        return pass ? 0 : 1;
    };
-    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    if(false)
+        ;
+#ifdef CK_ENABLE_FP32
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
@@ -137,6 +140,8 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
+#endif
+#ifdef CK_ENABLE_FP16
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
@@ -153,7 +158,8 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
    }
-#ifdef __bf16__
+#endif
+#ifdef CK_ENABLE_BF16
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
@@ -171,7 +177,7 @@ int profile_gemm(int argc, char* argv[])
        return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
    }
#endif
-#ifdef __int8__
+#ifdef CK_ENABLE_INT8
    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_multiply_add_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_multiply_add"
#define OP_DESC "GEMM+MULTIPLY+ADD"
int profile_gemm_multiply_add(int argc, char* argv[])
{
enum struct MatrixLayout
{
MK_KN_MN_MN_MN, // 0
MK_NK_MN_MN_MN, // 1
};
enum struct MatrixDataType
{
F16_F16_F16_F16_F16, // 0
F16_F8_F32_F32_F16, // 1
};
if(argc != 16)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp16; 1: fp16Afp8B)\n");
printf("arg3: matrix layout (0: E[m, n] = Multiply_Add((A[m, k] * B[k, n]) x D1[m, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = Multiply_Add((A[m, k] * B[n, k]) x D1[m, n] + D0[m, n]);\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<MatrixDataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideD0 = std::stoi(argv[13]);
const int StrideD1 = std::stoi(argv[14]);
const int StrideE = std::stoi(argv[15]);
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto d1_type,
auto e_type,
auto a_layout,
auto b_layout,
auto d0_layout,
auto d1_layout,
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using D1DataType = decltype(d1_type);
using EDataType = decltype(e_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using D0Layout = decltype(d0_layout);
using D1Layout = decltype(d1_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_multiply_add_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
D1DataType,
EDataType,
ALayout,
BLayout,
D0Layout,
D1Layout,
ELayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
(StrideE < 0) ? DefaultStrideE : StrideE);
return pass ? 0 : 1;
};
if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F8_F32_F32_F16 &&
layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F8_F32_F32_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_add);
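For reference, a hypothetical ckProfiler invocation of this new entry point, following the argument order printed by its help text (arg8 to 15 are M, N, K and the five strides; the sizes are purely illustrative, and negative strides fall back to the packed defaults computed in the profile lambda):

# illustrative only: fp16 data (arg2 = 0), MK_KN_MN_MN_MN layout (arg3 = 0), verify, integer init, no print, time kernel
ckProfiler gemm_multiply_add 0 0 1 1 0 1 1024 1024 1024 -1 -1 -1 -1 -1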
@@ -23,6 +23,8 @@ enum struct GemmDataType
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
+    F8_F16_F16,     // 4
+    F16_F8_F16,     // 5
};
#define OP_NAME "gemm_splitk"
@@ -33,7 +35,7 @@ int profile_gemm_splitk(int argc, char* argv[])
    if(argc != 15)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -65,6 +67,7 @@ int profile_gemm_splitk(int argc, char* argv[])
    using F32 = float;
    using F16 = ck::half_t;
+    using F8  = ck::f8_t;
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -143,6 +146,38 @@ int profile_gemm_splitk(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;
...
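A hypothetical invocation of the split-K profiler with one of the new mixed-precision types might look as follows; the first seven arguments follow the help text above, while the remaining positional arguments are assumed to be M, N, K, the three strides and the split-K factor (that tail is not shown in this hunk, so treat the ordering as an assumption):

# illustrative only: f8@f16 data (arg2 = 4), MK_NK_MN layout (arg3 = 1), split-K factor assumed to be the last argument
ckProfiler gemm_splitk 4 1 1 2 0 1 3840 4096 4096 4096 4096 4096 4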
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_streamk_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
#define OP_NAME "gemm_streamk"
#define OP_DESC "StreamK GEMM"
int profile_gemm_streamk(int argc, char* argv[])
{
if(argc < 14)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: num_sk_blocks (optional)\n");
exit(1);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const uint32_t NumSKBlocks =
argc >= 15 ? static_cast<uint32_t>(std::stoul(std::string(argv[14]))) : 0xffffffff;
using F32 = float;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto c_type,
auto a_layout,
auto b_layout,
auto c_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using CDataType = decltype(c_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using CLayout = decltype(c_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_streamk_impl<ADataType,
BDataType,
AccDataType,
CDataType,
ALayout,
BLayout,
CLayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA <= 0) ? DefaultStrideA : StrideA,
(StrideB <= 0) ? DefaultStrideB : StrideB,
(StrideC <= 0) ? DefaultStrideC : StrideC,
NumSKBlocks);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_streamk);
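As with the other GEMM profilers, a hypothetical Stream-K run can be assembled from the help text above; strides of 0 or below fall back to the packed defaults, and the trailing num_sk_blocks argument is optional (it defaults to 0xffffffff when omitted):

# illustrative only: fp16 data (arg2 = 1), MK_NK_MN layout (arg3 = 1), packed strides, 32 stream-k blocks
ckProfiler gemm_streamk 1 1 1 2 0 1 3840 4096 4096 0 0 0 32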
@@ -77,15 +77,10 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using GNHWC = ck::tensor_layout::convolution::GNHWC;
-    using NHWGC = ck::tensor_layout::convolution::NHWGC;
-    using GKYXC = ck::tensor_layout::convolution::GKYXC;
-    using GNHWK = ck::tensor_layout::convolution::GNHWK;
-    using NHWGK = ck::tensor_layout::convolution::NHWGK;
+    using namespace ck::tensor_layout::convolution;
    constexpr auto I2 = ck::Number<2>{};
+    constexpr auto I3 = ck::Number<3>{};
    auto profile = [&](auto num_dim_spatial_tmp,
                       auto out_layout,
@@ -116,36 +111,70 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
        return pass ? 0 : 1;
    };
-    // GNHWC_GKYXC_GNHWK
-    if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
-    {
-        if(data_type == ConvDataType::F32_F32_F32)
-        {
-            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
-        }
-        else if(data_type == ConvDataType::F16_F16_F16)
-        {
-            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
-        }
-        else if(data_type == ConvDataType::BF16_BF16_BF16)
-        {
-            return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
-        }
-    }
-    // NHWGC_GKYXC_NHWGK
-    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
-    {
-        if(data_type == ConvDataType::F32_F32_F32)
-        {
-            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
-        }
-        else if(data_type == ConvDataType::F16_F16_F16)
-        {
-            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
-        }
-        else if(data_type == ConvDataType::BF16_BF16_BF16)
-        {
-            return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
-        }
-    }
+    if(num_dim_spatial == 2)
+    {
+        if(layout == ConvLayout::GNHWC_GKYXC_GNHWK)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+        else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+    }
+    else if(num_dim_spatial == 3)
+    {
+        if(layout == ConvLayout::GNHWC_GKYXC_GNHWK)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+        else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+        {
+            if(data_type == ConvDataType::F32_F32_F32)
+            {
+                return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F32{}, F32{}, F32{});
+            }
+            else if(data_type == ConvDataType::F16_F16_F16)
+            {
+                return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F16{}, F16{}, F16{});
+            }
+            else if(data_type == ConvDataType::BF16_BF16_BF16)
+            {
+                return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, BF16{}, BF16{}, BF16{});
+            }
+        }
+    }
...
@@ -83,19 +83,7 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using GNWC   = ck::tensor_layout::convolution::GNWC;
-    using GNHWC  = ck::tensor_layout::convolution::GNHWC;
-    using NHWGC  = ck::tensor_layout::convolution::NHWGC;
-    using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-    using GKXC   = ck::tensor_layout::convolution::GKXC;
-    using GKYXC  = ck::tensor_layout::convolution::GKYXC;
-    using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-    using GNWK   = ck::tensor_layout::convolution::GNWK;
-    using GNHWK  = ck::tensor_layout::convolution::GNHWK;
-    using NHWGK  = ck::tensor_layout::convolution::NHWGK;
-    using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
+    using namespace ck::tensor_layout::convolution;
    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
@@ -194,6 +182,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
        }
    }
+    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
+    {
+        if(data_type == ConvDataType::F32_F32_F32)
+        {
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16)
+        {
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{});
+        }
+        else if(data_type == ConvDataType::BF16_F32_BF16)
+        {
+            // fp32 atomic add is used for weight tensor in bf16 kernel
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{});
+        }
+    }
    std::cout << "this data_type & layout is not implemented" << std::endl;
...
@@ -88,7 +88,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);
    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
+#ifdef CK_ENABLE_FP16
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
@@ -173,7 +173,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }
+#endif
    return 0;
}
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_image_to_column_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
enum struct ConvLayout
{
NHWC, // 0
};
enum struct DataType
{
F32_F32, // 0
F16_F16, // 1
BF16_BF16, // 2
INT8_INT8, // 3
};
#define OP_NAME "image_to_column"
#define OP_DESC "Image To Column"
static void print_helper_msg()
{
std::cout
// clang-format off
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format on
}
} // namespace
int profile_image_to_column(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;
using namespace ck::tensor_layout::convolution;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, auto in_type, auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::
profile_image_to_column_impl<NDimSpatial, InLayout, InDataType, OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
// NHWC
if(layout == ConvLayout::NHWC)
{
if(num_dim_spatial == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, GNWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, GNWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I1, GNWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I1, GNWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, GNHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, GNHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I2, GNHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I2, GNHWC{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, GNDHWC{}, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, GNDHWC{}, F16{}, F16{});
}
else if(data_type == DataType::BF16_BF16)
{
return profile(I3, GNDHWC{}, BF16{}, BF16{});
}
else if(data_type == DataType::INT8_INT8)
{
return profile(I3, GNDHWC{}, INT8{}, INT8{});
}
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column);
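A hypothetical 2D invocation of the new image_to_column profiler, assuming the conv-parameter tail follows the usual CK order printed by get_conv_param_parser_helper_msg (G, N, K, C, filter sizes, input sizes, strides, dilations, left pads, right pads); with num_dim_spatial = 2 that gives the 8 + 1 + 4 + 12 = 25 arguments the argc check above expects:

# illustrative only: fp16 (arg2 = 1), NHWC layout (arg3 = 0), 2 spatial dims, G=1 N=32 K=64 C=128, 3x3 filter on 28x28 input
ckProfiler image_to_column 1 0 1 2 0 1 2 1 32 64 128 3 3 28 28 1 1 1 1 1 1 1 1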
@@ -6,15 +6,20 @@
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
-#include "profiler/profile_pool2d_fwd_impl.hpp"
+#include "profiler/profile_max_pool3d_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler_operation_registry.hpp"

using ck::index_t;

-struct avgPoolFwdArgParser
+struct maxPoolbwdArgParser
{
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
-        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};

    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
@@ -45,24 +50,25 @@ struct avgPoolFwdArgParser
    }
};

-void print_help_avg_pool2d_fwd()
+void print_help_max_pool3d_bwd()
{
-    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
              << "arg4: print tensor value (0: no; 1: yes)\n"
              << "arg5: time kernel (0=no, 1=yes)\n"
-              << "--length: input tensor length for NDHW(e.g, --length 2 32 30 30) \n"
-              << "--wsize: window size for YX (e.g, --wsize 2 2) \n"
-              << "--wstride: window stride for HW (e.g, --wstride 2 2) \n"
-              << "--pad1: left side of padding in HW (e.g, --pad1 1 1) \n"
-              << "--pad2: right side of padding in HW (e.g, --pad2 1 1) \n"
-              << "eg: ckProfiler avg_pool2d_fwd 0 1 2 0 1 0 --length 2 32 30 30 --wsize 2 2 "
-                 "--wstride 2 2 --pad1 1 1 --pad2 1 1"
+              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
+              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
+              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
+              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
+              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
+              << "eg: ckProfiler max_pool3d_bwd 0 1 2 0 1 --length 2 32 30 30 30 --wsize 2 2 2 "
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
}

-int profile_avg_pool2d_fwd(int argc, char* argv[])
+int profile_max_pool3d_bwd(int argc, char* argv[])
{
    ck::DataTypeEnum data_type = ck::DataTypeEnum::Half;
    bool do_verification       = true;
@@ -70,18 +76,19 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
    bool do_log      = false;
    bool time_kernel = true;

-    std::vector<index_t> in_length = {2, 32, 30, 30};
-    std::vector<index_t> wsize     = {2, 2};
-    std::vector<index_t> wstride   = {2, 2};
-    std::vector<index_t> pad1      = {1, 1};
-    std::vector<index_t> pad2      = {1, 1};
+    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
+    std::vector<index_t> wsize     = {2, 2, 2};
+    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> wdilation = {1, 1, 1};
+    std::vector<index_t> pad1      = {1, 1, 1};
+    std::vector<index_t> pad2      = {1, 1, 1};

-    if(argc != 2 && argc != 25)
+    if(argc != 2 && argc != 33)
    {
-        print_help_avg_pool2d_fwd();
+        print_help_max_pool3d_bwd();
        return 0;
    }
-    else if(argc == 25)
+    else if(argc == 33)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
@@ -90,23 +97,48 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
        time_kernel = std::stoi(argv[6]);

        // parse the long options
-        avgPoolFwdArgParser arg_parser;
+        maxPoolbwdArgParser arg_parser;
        arg_parser(argc, argv);
        in_length = arg_parser.long_opts["length"];
        wsize     = arg_parser.long_opts["wsize"];
        wstride   = arg_parser.long_opts["wstride"];
+        wdilation = arg_parser.long_opts["wdilation"];
        pad1      = arg_parser.long_opts["pad1"];
        pad2      = arg_parser.long_opts["pad2"];
    }

-    using F16 = ck::half_t;
-    using F32 = float;
-    using I32 = int32_t;
-    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#ifdef CK_ENABLE_FP16
+    using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+    using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+    using F32 = float;
+#endif
+    using I32 = int32_t;

-    if(data_type == ck::DataTypeEnum::Half)
+    if(false)
+        ;
+#ifdef CK_ENABLE_FP16
+    else if(data_type == ck::DataTypeEnum::Half)
    {
-        ck::profiler::profile_pool2d_fwd_impl<F16, F16, F32, I32, ReduceOpId, false, false>(
-            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+        ck::profiler::profile_max_pool3d_bwd_impl<F16, F16, I32, F16, F16, false>(
+            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
    }
+#endif
+#ifdef CK_ENABLE_BF16
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        ck::profiler::profile_max_pool3d_bwd_impl<BF16, BF16, I32, BF16, BF16, false>(
+            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
+    }
+#endif
@@ -114,22 +146,26 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
+#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
-        ck::profiler::profile_pool2d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
-            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+        ck::profiler::profile_max_pool3d_bwd_impl<F32, F32, I32, F32, F32, false>(
+            do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
    }
+#endif
    else
    {
        throw std::runtime_error("not implemented yet");
@@ -138,4 +174,4 @@ int profile_avg_pool2d_fwd(int argc, char* argv[])
    return 0;
}

-REGISTER_PROFILER_OPERATION("avg_pool2d_fwd", "avg_pool2d fwd", profile_avg_pool2d_fwd);
+REGISTER_PROFILER_OPERATION("max_pool3d_bwd", "max_pool3d bwd", profile_max_pool3d_bwd);
@@ -13,8 +13,12 @@ using ck::index_t;
struct maxPoolFwdArgParser
{
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
-        {"length", {}}, {"wsize", {}}, {"wstride", {}}, {"pad1", {}}, {"pad2", {}}};
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}},
+                                                                   {"wsize", {}},
+                                                                   {"wstride", {}},
+                                                                   {"wdilation", {}},
+                                                                   {"pad1", {}},
+                                                                   {"pad2", {}}};

    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
@@ -47,7 +51,7 @@ struct maxPoolFwdArgParser
void print_help_max_pool3d_fwd()
{
-    std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
+    std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n"
              << "arg2: verification (0: no; 1: yes)\n"
              << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
              << "arg4: print tensor value (0: no; 1: yes)\n"
@@ -56,10 +60,11 @@ void print_help_max_pool3d_fwd()
              << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n"
              << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n"
              << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n"
+              << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n"
              << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n"
              << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n"
              << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 "
-                 "--wstride 2 2 2 --pad1 1 1 1 --pad2 1 1 1"
+                 "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1"
              << std::endl;
}
@@ -75,15 +80,16 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
    std::vector<index_t> in_length = {2, 32, 30, 30, 30};
    std::vector<index_t> wsize     = {2, 2, 2};
    std::vector<index_t> wstride   = {2, 2, 2};
+    std::vector<index_t> wdilation = {1, 1, 1};
    std::vector<index_t> pad1      = {1, 1, 1};
    std::vector<index_t> pad2      = {1, 1, 1};

-    if(argc != 2 && argc != 30)
+    if(argc != 2 && argc != 34)
    {
        print_help_max_pool3d_fwd();
        return 0;
    }
-    else if(argc == 30)
+    else if(argc == 34)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
@@ -98,65 +104,136 @@ int profile_max_pool3d_fwd(int argc, char* argv[])
        in_length = arg_parser.long_opts["length"];
        wsize     = arg_parser.long_opts["wsize"];
        wstride   = arg_parser.long_opts["wstride"];
+        wdilation = arg_parser.long_opts["wdilation"];
        pad1      = arg_parser.long_opts["pad1"];
        pad2      = arg_parser.long_opts["pad2"];
    }

-    using F16 = ck::half_t;
-    using F32 = float;
-    using I32 = int32_t;
+#ifdef CK_ENABLE_FP16
+    using F16 = ck::half_t;
+#endif
+#ifdef CK_ENABLE_BF16
+    using BF16 = ck::bhalf_t;
+#endif
+#ifdef CK_ENABLE_FP32
+    using F32 = float;
+#endif
+    using I32   = int32_t;
+    using NDHWC = ck::tensor_layout::convolution::NDHWC;

#if 1
    constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
    constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

-    if(data_type == ck::DataTypeEnum::Half)
+    if(false)
+        ;
+#ifdef CK_ENABLE_FP16
+    else if(data_type == ck::DataTypeEnum::Half)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, true>(
-                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, ReduceOpId, false, false>(
-                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+            ck::profiler::profile_pool3d_fwd_impl<F16, F16, F16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
    }
+#endif
+#ifdef CK_ENABLE_BF16
+    else if(data_type == ck::DataTypeEnum::BFloat16)
+    {
+        if(return_index)
+            ck::profiler::profile_pool3d_fwd_impl<BF16, BF16, BF16, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
+        else
+            ck::profiler::profile_pool3d_fwd_impl<BF16, BF16, BF16, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
+    }
+#endif
+#ifdef CK_ENABLE_FP32
    else if(data_type == ck::DataTypeEnum::Float)
    {
        if(return_index)
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, true>(
-                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, true>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
        else
-            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, ReduceOpId, false, false>(
-                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, pad1, pad2);
+            ck::profiler::profile_pool3d_fwd_impl<F32, F32, F32, I32, NDHWC, NDHWC, ReduceOpId, false, false>(
+                do_verification, init_method, do_log, time_kernel, in_length, wsize, wstride, wdilation, pad1, pad2);
    }
+#endif
    else
    {
        throw std::runtime_error("not implemented yet");
...
-#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
-git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
@@ -16,4 +16,3 @@ cmake
    -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
    -D USE_BITINT_EXTENSION_INT4=OFF \
    ${MY_PROJECT_SOURCE}
@@ -11,7 +11,7 @@ run_and_check() {
}

echo "I: Installing tools required for pre-commit checks..."
-run_and_check apt install clang-format-10
+run_and_check apt install clang-format-12

echo "I: Installing pre-commit itself..."
run_and_check pip3 install pre-commit
...
@@ -3,13 +3,6 @@
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
-OP=$1
-DATATYPE=$2
-LAYOUT=$3
-VERIFY=$4
-INIT=$5
-LOG=$6
-TIME=$7
OP=$1
DATATYPE=$2
...
@@ -57,9 +57,10 @@ add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
add_subdirectory(batchnorm)
add_subdirectory(contraction)
-add_subdirectory(pool_fwd)
+add_subdirectory(pool)
add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
-if(GPU_TARGETS MATCHES "gfx1100")
+add_subdirectory(image_to_column)
+if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
endif()
@@ -2,21 +2,26 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-        add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
-        target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
-        target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
-        add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
-        target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
-        target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
-        add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
-        target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
-        target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
-        add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
-        target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
-        target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+        if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+            add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
+            target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
+            target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
+        endif()
+        if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+            add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+            target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
+            target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
+        endif()
+        if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+            add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
+            target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
+            target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
+        endif()
+        if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+            add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+            target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+            target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+        endif()
        set(target 1)
    endif()
endforeach()
\ No newline at end of file
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-        add_custom_target(test_batched_gemm_gemm)
-        add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
-        target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
-        add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-        set(target 1)
+        if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+            add_custom_target(test_batched_gemm_gemm)
+            add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
+            target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
+            add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+            set(target 1)
+        endif()
    endif()
endforeach()
\ No newline at end of file
-# TODO: Enable for gfx90a after complier fix
-if(NOT GPU_TARGETS MATCHES "gfx90a")
+if(DL_KERNELS)
    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
endif()
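These DTYPES guards mean that a build configured with a restricted data-type list only generates the matching test targets. A hypothetical configure line (assuming DTYPES is passed as an ordinary CMake cache variable that the MATCHES checks above inspect) could look like:

# illustrative only: build just the fp16 and int8 variants of the guarded tests
cmake -D GPU_TARGETS=gfx90a -D DTYPES="fp16;int8" -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON ${MY_PROJECT_SOURCE}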