Commit 271269a5 authored by Adam Osewski

Merge remote-tracking branch 'origin/develop' into aosewski/gemm_tile_loop

parents 648f1f13 04f93aad
@@ -17,8 +17,9 @@
 static void print_helper_msg()
 {
     std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-              << "arg2: data type (0: fp32; 1: f64)\n"
-              << "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
+              << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
+              << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
+              << "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
               << "          1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
@@ -26,40 +27,42 @@ static void print_helper_msg()
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
               << "          3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
-              << "arg4: verification (0: no; 1: yes)\n"
-              << "arg5: initialization (0: no init; 1: integer value; 2: decimal "
+              << "arg5: verification (0: no; 1: yes)\n"
+              << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
               << "value)\n"
-              << "arg6: print tensor value (0: no; 1: yes)\n"
-              << "arg7: time kernel (0: no, 1: yes)\n"
-              << "arg8 and arg9: alpha and beta\n"
-              << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
-              << "arg16 to 31: Strides for A, B, D and E (skip for default)\n"
+              << "arg7: print tensor value (0: no; 1: yes)\n"
+              << "arg8: time kernel (0: no, 1: yes)\n"
+              << "arg9: alpha\n"
+              << "arg10: beta\n"
+              << "arg11 to 16: M0, M1, N0, N1, K0, K1\n"
+              << "arg17 to 32: Strides for A, B, D and E (skip for default)\n"
               << std::endl;
 }

 int profile_contraction_bilinear(int argc, char* argv[])
 {
-    const bool default_strides = argc == 16;
-    if(argc != 32 && argc != 16)
+    const bool default_strides = argc == 17;
+    if(argc != 33 && argc != 17)
     {
         print_helper_msg();
         exit(1);
     }

-    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
-    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[3]));
-    const bool do_verification    = std::stoi(argv[4]);
-    const ck::index_t init_method = std::stoi(argv[5]);
-    const bool do_log             = std::stoi(argv[6]);
-    const bool time_kernel        = std::stoi(argv[7]);
-    const float alpha             = std::stof(argv[8]);
-    const float beta              = std::stof(argv[9]);
+    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
+    const auto compute_data_type  = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
+    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
+    const bool do_verification    = std::stoi(argv[5]);
+    const ck::index_t init_method = std::stoi(argv[6]);
+    const bool do_log             = std::stoi(argv[7]);
+    const bool time_kernel        = std::stoi(argv[8]);
+    const float alpha             = std::stof(argv[9]);
+    const float beta              = std::stof(argv[10]);

     std::vector<ck::index_t> M;
     std::vector<ck::index_t> N;
     std::vector<ck::index_t> K;
-    const ck::index_t dims_arg_num = 10;
+    const ck::index_t dims_arg_num = 11;

     collect_index_params(argv, M, dims_arg_num, 2);
     collect_index_params(argv, N, dims_arg_num + 2, 2);
     collect_index_params(argv, K, dims_arg_num + 4, 2);
@@ -76,90 +79,130 @@ int profile_contraction_bilinear(int argc, char* argv[])
         collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
     }

-    using F32 = float;
-    using F64 = double;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using F32  = float;
+    using F64  = double;

-    auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) {
-        using ALayout   = decltype(a_layout);
-        using BLayout   = decltype(b_layout);
-        using CDELayout = decltype(cde_layout);
-
-        using DataType = decltype(type);
-
-        if(default_strides)
-        {
-            assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
-            assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
-            assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
-            assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
-        }
-
-        bool pass = ck::profiler::profile_contraction_impl<ALayout,
-                                                           BLayout,
-                                                           CDELayout,
-                                                           DataType,
-                                                           ck::Tuple<DataType>,
-                                                           Bilinear>(do_verification,
-                                                                     init_method,
-                                                                     do_log,
-                                                                     time_kernel,
-                                                                     Bilinear{alpha, beta},
-                                                                     M,
-                                                                     N,
-                                                                     K,
-                                                                     StridesA,
-                                                                     StridesB,
-                                                                     StridesE,
-                                                                     StridesD);
-        return pass;
-    };
+    auto profile =
+        [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
+            using ALayout   = decltype(a_layout);
+            using BLayout   = decltype(b_layout);
+            using CDELayout = decltype(cde_layout);
+
+            using DataType        = decltype(type);
+            using ComputeDataType = decltype(compute_type);
+
+            if(default_strides)
+            {
+                assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
+                assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
+                assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
+                assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
+            }
+
+            bool pass = ck::profiler::profile_contraction_impl<ALayout,
+                                                               BLayout,
+                                                               CDELayout,
+                                                               DataType,
+                                                               ComputeDataType,
+                                                               ck::Tuple<DataType>,
+                                                               Bilinear>(do_verification,
+                                                                         init_method,
+                                                                         do_log,
+                                                                         time_kernel,
+                                                                         Bilinear{alpha, beta},
+                                                                         M,
+                                                                         N,
+                                                                         K,
+                                                                         StridesA,
+                                                                         StridesB,
+                                                                         StridesE,
+                                                                         StridesD);
+            return pass;
+        };
+
+    auto run_profile_for_datatype = [&](auto type, auto compute_type) {
+        if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
+        {
+            return profile(Row{}, Row{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
+        {
+            return profile(Row{}, Col{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
+        {
+            return profile(Col{}, Row{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
+        {
+            return profile(Col{}, Col{}, Row{}, type, compute_type);
+        }
+        return false;
+    };

-    if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-       layout == ContractionMatrixLayout::MK_KN_MN_MN)
-    {
-        return profile(Row{}, Row{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::MK_NK_MN_MN)
-    {
-        return profile(Row{}, Col{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::KM_KN_MN_MN)
-    {
-        return profile(Col{}, Row{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::KM_NK_MN_MN)
-    {
-        return profile(Col{}, Col{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::MK_KN_MN_MN)
-    {
-        return profile(Row{}, Row{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::MK_NK_MN_MN)
-    {
-        return profile(Row{}, Col{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::KM_KN_MN_MN)
-    {
-        return profile(Col{}, Row{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::KM_NK_MN_MN)
-    {
-        return profile(Col{}, Col{}, Row{}, F64{});
-    }
-    else
-    {
-        std::cout << "this data_type & layout is not implemented" << std::endl;
-
-        return 1;
-    }
+    if(data_type == ContractionDataType::F32_F32_F32_F32)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F32{}, F32{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::F16)
+        {
+            return run_profile_for_datatype(F32{}, F16{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::BF16)
+        {
+            return run_profile_for_datatype(F32{}, BF16{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64)
+    {
+        if(compute_data_type == ContractionComputeDataType::F64)
+        {
+            return run_profile_for_datatype(F64{}, F64{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F64{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::F16_F16_F16_F16)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F16{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(BF16{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    return 1;
 }

 REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_bilinear);
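
With the compute data type added as arg3, every later argument shifts up by one and the expected argc becomes 17 (default strides) or 33 (explicit strides). A hypothetical invocation under the new scheme, assuming the usual ckProfiler driver binary and example sizes:

    ckProfiler contraction_bilinear 0 3 0 1 1 0 1 1.0 1.0 32 32 32 32 32 32

i.e. fp32 tensors with bf16 compute type, layout 0, verification and kernel timing enabled, alpha = beta = 1, M/N/K dimensions of {32, 32} each, and strides left to their defaults.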
@@ -17,8 +17,9 @@
 static void print_helper_msg()
 {
     std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-              << "arg2: data type (0: fp32; 1: f64)\n"
-              << "arg3: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
+              << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
+              << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
+              << "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
               << "          1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
@@ -26,39 +27,40 @@ static void print_helper_msg()
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
               << "          3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
                  "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
-              << "arg4: verification (0: no; 1: yes)\n"
-              << "arg5: initialization (0: no init; 1: integer value; 2: decimal "
+              << "arg5: verification (0: no; 1: yes)\n"
+              << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
               << "value)\n"
-              << "arg6: print tensor value (0: no; 1: yes)\n"
-              << "arg7: time kernel (0: no, 1: yes)\n"
-              << "arg8: alpha\n"
-              << "arg9 to 14: M0, M1, N0, N1, K0, K1\n"
-              << "arg15 to 30: Strides for A, B, D and E (skip for default)\n"
+              << "arg7: print tensor value (0: no; 1: yes)\n"
+              << "arg8: time kernel (0: no, 1: yes)\n"
+              << "arg9: alpha\n"
+              << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
+              << "arg16 to 31: Strides for A, B, D and E (skip for default)\n"
               << std::endl;
 }

 int profile_contraction_scale(int argc, char* argv[])
 {
-    const bool default_strides = argc == 15;
-    if(argc != 31 && argc != 15)
+    const bool default_strides = argc == 16;
+    if(argc != 32 && argc != 16)
     {
         print_helper_msg();
         exit(1);
     }

-    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
-    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[3]));
-    const bool do_verification    = std::stoi(argv[4]);
-    const ck::index_t init_method = std::stoi(argv[5]);
-    const bool do_log             = std::stoi(argv[6]);
-    const bool time_kernel        = std::stoi(argv[7]);
-    const float alpha             = std::stof(argv[8]);
+    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
+    const auto compute_data_type  = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
+    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
+    const bool do_verification    = std::stoi(argv[5]);
+    const ck::index_t init_method = std::stoi(argv[6]);
+    const bool do_log             = std::stoi(argv[7]);
+    const bool time_kernel        = std::stoi(argv[8]);
+    const float alpha             = std::stof(argv[9]);

     std::vector<ck::index_t> M;
     std::vector<ck::index_t> N;
     std::vector<ck::index_t> K;
-    const ck::index_t dims_arg_num = 9;
+    const ck::index_t dims_arg_num = 10;

     collect_index_params(argv, M, dims_arg_num, 2);
     collect_index_params(argv, N, dims_arg_num + 2, 2);
     collect_index_params(argv, K, dims_arg_num + 4, 2);
@@ -75,88 +77,131 @@ int profile_contraction_scale(int argc, char* argv[])
         collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
     }

-    using F32 = float;
-    using F64 = double;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using F32  = float;
+    using F64  = double;

-    auto profile = [&](auto a_layout, auto b_layout, auto cde_layout, auto type) {
-        using ALayout   = decltype(a_layout);
-        using BLayout   = decltype(b_layout);
-        using CDELayout = decltype(cde_layout);
-
-        using DataType = decltype(type);
-
-        if(default_strides)
-        {
-            assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
-            assign_default_strides(b_layout, StridesB, {K[0], K[1], N[0], N[1]});
-            assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
-            assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
-        }
-
-        bool pass = ck::profiler::
-            profile_contraction_impl<ALayout, BLayout, CDELayout, DataType, ck::Tuple<>, Scale>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                Scale{alpha},
-                M,
-                N,
-                K,
-                StridesA,
-                StridesB,
-                StridesE,
-                StridesD);
-        return pass;
-    };
+    auto profile =
+        [&](auto a_layout, auto b_layout, auto cde_layout, auto type, auto compute_type) {
+            using ALayout   = decltype(a_layout);
+            using BLayout   = decltype(b_layout);
+            using CDELayout = decltype(cde_layout);
+
+            using DataType        = decltype(type);
+            using ComputeDataType = decltype(compute_type);
+
+            if(default_strides)
+            {
+                assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
+                assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
+                assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
+                assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
+            }
+
+            bool pass = ck::profiler::profile_contraction_impl<ALayout,
+                                                               BLayout,
+                                                               CDELayout,
+                                                               DataType,
+                                                               ComputeDataType,
+                                                               ck::Tuple<>,
+                                                               Scale>(do_verification,
+                                                                      init_method,
+                                                                      do_log,
+                                                                      time_kernel,
+                                                                      Scale{alpha},
+                                                                      M,
+                                                                      N,
+                                                                      K,
+                                                                      StridesA,
+                                                                      StridesB,
+                                                                      StridesE,
+                                                                      StridesD);
+            return pass;
+        };
+
+    auto run_profile_for_datatype = [&](auto type, auto compute_type) {
+        if(layout == ContractionMatrixLayout::MK_KN_MN_MN)
+        {
+            return profile(Row{}, Row{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::MK_NK_MN_MN)
+        {
+            return profile(Row{}, Col{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::KM_KN_MN_MN)
+        {
+            return profile(Col{}, Row{}, Row{}, type, compute_type);
+        }
+        else if(layout == ContractionMatrixLayout::KM_NK_MN_MN)
+        {
+            return profile(Col{}, Col{}, Row{}, type, compute_type);
+        }
+        return false;
+    };

-    if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-       layout == ContractionMatrixLayout::MK_KN_MN_MN)
-    {
-        return profile(Row{}, Row{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::MK_NK_MN_MN)
-    {
-        return profile(Row{}, Col{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::KM_KN_MN_MN)
-    {
-        return profile(Col{}, Row{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F32_F32_F32_F32 &&
-            layout == ContractionMatrixLayout::KM_NK_MN_MN)
-    {
-        return profile(Col{}, Col{}, Row{}, F32{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::MK_KN_MN_MN)
-    {
-        return profile(Row{}, Row{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::MK_NK_MN_MN)
-    {
-        return profile(Row{}, Col{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::KM_KN_MN_MN)
-    {
-        return profile(Col{}, Row{}, Row{}, F64{});
-    }
-    else if(data_type == ContractionDataType::F64_F64_F64_F64 &&
-            layout == ContractionMatrixLayout::KM_NK_MN_MN)
-    {
-        return profile(Col{}, Col{}, Row{}, F64{});
-    }
-    else
-    {
-        std::cout << "this data_type & layout is not implemented" << std::endl;
-
-        return 1;
-    }
+    if(data_type == ContractionDataType::F32_F32_F32_F32)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F32{}, F32{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::F16)
+        {
+            return run_profile_for_datatype(F32{}, F16{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::BF16)
+        {
+            return run_profile_for_datatype(F32{}, BF16{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::F64_F64_F64_F64)
+    {
+        if(compute_data_type == ContractionComputeDataType::F64)
+        {
+            return run_profile_for_datatype(F64{}, F64{});
+        }
+        else if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F64{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::F16_F16_F16_F16)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(F16{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    else if(data_type == ContractionDataType::BF16_BF16_BF16_BF16)
+    {
+        if(compute_data_type == ContractionComputeDataType::F32)
+        {
+            return run_profile_for_datatype(BF16{}, F32{});
+        }
+        else
+        {
+            std::cout << "Incorrect combination of data type and compute data type." << std::endl;
+            return 1;
+        }
+    }
+    return 1;
 }

 REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_contraction_scale);
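
The scale profiler follows the same shift: arg3 now selects the compute data type, and argc becomes 16 or 32 since there is no beta argument. A hypothetical default-stride run, under the same ckProfiler assumption:

    ckProfiler contraction_scale 0 2 0 1 1 0 1 1.0 16 16 32 32 16 16

i.e. fp32 tensors with f16 compute type, M = {16, 16}, N = {32, 32}, K = {16, 16}.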
@@ -20,9 +20,10 @@ enum struct ConvLayout
 enum struct ConvDataType
 {
     F32_F32_F32,   // 0
     F16_F16_F16,   // 1
     BF16_F32_BF16, // 2
+    F16_F16_F16_BF8_F8 // 3
 };

 #define OP_NAME "grouped_conv_bwd_weight"
@@ -33,7 +34,8 @@ static void print_helper_msg()
     std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
               << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
               << "                 1: Input fp16, Weight fp16, Output fp16\n"
-              << "                 2: Input bf16, Weight fp32, Output bf16)\n"
+              << "                 2: Input bf16, Weight fp32, Output bf16\n"
+              << "                 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8)\n"
               << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
                  "N, K, Ho, Wo]\n"
               << "                     1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
@@ -82,6 +84,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
     using F32  = float;
     using F16  = ck::half_t;
     using BF16 = ck::bhalf_t;
+#ifdef CK_ENABLE_FP8
+    using F8 = ck::f8_t;
+#endif
+#ifdef CK_ENABLE_BF8
+    using BF8 = ck::bf8_t;
+#endif

     using namespace ck::tensor_layout::convolution;
@@ -95,7 +103,9 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
                        auto out_layout,
                        auto in_type,
                        auto wei_type,
-                       auto out_type) {
+                       auto out_type,
+                       auto compute_type_a,
+                       auto compute_type_b) {
         constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;

         using InLayout = decltype(in_layout);
@@ -106,13 +116,18 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
         using WeiDataType = decltype(wei_type);
         using OutDataType = decltype(out_type);

+        using ComputeTypeA = decltype(compute_type_a);
+        using ComputeTypeB = decltype(compute_type_b);
+
         bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
                                                                        InLayout,
                                                                        WeiLayout,
                                                                        OutLayout,
                                                                        InDataType,
                                                                        WeiDataType,
-                                                                       OutDataType>(
+                                                                       OutDataType,
+                                                                       ComputeTypeA,
+                                                                       ComputeTypeB>(
             do_verification, init_method, do_log, time_kernel, params, split_k);

         return pass ? 0 : 1;
@@ -122,80 +137,84 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
     {
         if(data_type == ConvDataType::F32_F32_F32)
         {
-            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
         }
         else if(data_type == ConvDataType::F16_F16_F16)
         {
-            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
         else if(data_type == ConvDataType::BF16_F32_BF16)
        {
             // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
     }
     else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
     {
         if(data_type == ConvDataType::F32_F32_F32)
         {
-            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
         }
         else if(data_type == ConvDataType::F16_F16_F16)
         {
-            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
         else if(data_type == ConvDataType::BF16_F32_BF16)
         {
             // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
     }
     else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
     {
         if(data_type == ConvDataType::F32_F32_F32)
         {
-            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{});
+            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
         }
         else if(data_type == ConvDataType::F16_F16_F16)
         {
-            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{});
+            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
         else if(data_type == ConvDataType::BF16_F32_BF16)
         {
             // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{});
+            return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
     }
     else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
     {
         if(data_type == ConvDataType::F32_F32_F32)
         {
-            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
         }
         else if(data_type == ConvDataType::F16_F16_F16)
         {
-            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
         else if(data_type == ConvDataType::BF16_F32_BF16)
         {
             // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
         }
     }
     else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
     {
         if(data_type == ConvDataType::F32_F32_F32)
         {
-            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{});
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
         }
         else if(data_type == ConvDataType::F16_F16_F16)
         {
-            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{});
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
         }
         else if(data_type == ConvDataType::BF16_F32_BF16)
         {
             // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{});
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
+        }
+        else if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
+        {
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
         }
     }
 ...
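
The new bf8@fp8 branch relies on the F8/BF8 aliases introduced above, which are only declared when CK_ENABLE_FP8/CK_ENABLE_BF8 are defined. A minimal sketch of keeping that branch buildable without those macros (an illustrative assumption, not part of this diff):

#if defined(CK_ENABLE_FP8) && defined(CK_ENABLE_BF8)
        else if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
        {
            // fp16 input/weight/output tensors; the GEMM reads A as bf8 and B as fp8
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
        }
#endif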
@@ -10,9 +10,12 @@
 #include <gtest/gtest.h>

 #include "profiler/profile_contraction_impl.hpp"
+#include "profiler/profile_contraction_utils.hpp"

-using F32 = float;
-using F64 = double;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using F64  = double;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -20,49 +23,49 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using Bilinear = ck::tensor_operation::element_wise::Bilinear;
 using Scale    = ck::tensor_operation::element_wise::Scale;

-struct MemoryParams
+struct Dimensions
 {
     std::vector<ck::index_t> M;
     std::vector<ck::index_t> N;
     std::vector<ck::index_t> K;
-    std::vector<ck::index_t> StridesA;
-    std::vector<ck::index_t> StridesB;
-    std::vector<ck::index_t> StridesC;
-    std::vector<ck::index_t> StridesD;
 };

 template <typename Tuple>
 class TestContraction : public ::testing::Test
 {
     protected:
     using ALayout        = std::tuple_element_t<0, Tuple>;
     using BLayout        = std::tuple_element_t<1, Tuple>;
     using CDLayout       = std::tuple_element_t<2, Tuple>;
     using DataType       = std::tuple_element_t<3, Tuple>;
     using DTupleDataType = std::tuple_element_t<4, Tuple>;
-    using CDElementOp    = std::tuple_element_t<5, Tuple>;
+    using ComputeDataType = std::tuple_element_t<5, Tuple>;
+    using CDElementOp     = std::tuple_element_t<6, Tuple>;

-    std::vector<MemoryParams> list_of_memory_params = {{{32, 32},
-                                                        {32, 32},
-                                                        {32, 32},
-                                                        {32768, 1024, 32, 1},
-                                                        {32768, 1024, 32, 1},
-                                                        {32768, 1024, 32, 1},
-                                                        {32768, 1024, 32, 1}},
-                                                       {{16, 16},
-                                                        {32, 32},
-                                                        {16, 16},
-                                                        {4096, 256, 16, 1},
-                                                        {16, 1, 8192, 256},
-                                                        {16384, 1024, 32, 1},
-                                                        {16384, 1024, 32, 1}}};
-
-    std::vector<ck::index_t> init_methods = {0, 1, 2};
+    std::vector<Dimensions> dimension_list = {{{32, 32}, {32, 32}, {32, 32}},
+                                              {{16, 16}, {32, 32}, {16, 16}}};
+
+    std::vector<ck::index_t> init_methods = {1, 2};

     std::unique_ptr<CDElementOp> p_cd_element_op;

     void Run()
     {
-        for(auto& memory_params : list_of_memory_params)
+        for(auto& dimension_params : dimension_list)
         {
+            std::vector<ck::index_t> StridesA;
+            std::vector<ck::index_t> StridesB;
+            std::vector<ck::index_t> StridesC;
+            std::vector<ck::index_t> StridesD;
+
+            const auto& M = dimension_params.M;
+            const auto& N = dimension_params.N;
+            const auto& K = dimension_params.K;
+
+            assign_default_strides(ALayout{}, StridesA, {M[0], M[1], K[0], K[1]});
+            assign_default_strides(BLayout{}, StridesB, {N[0], N[1], K[0], K[1]});
+            assign_default_strides(CDLayout{}, StridesC, {M[0], M[1], N[0], N[1]});
+            assign_default_strides(CDLayout{}, StridesD, {M[0], M[1], N[0], N[1]});
+
             for(const ck::index_t init_method : init_methods)
             {
                 bool pass =
@@ -70,19 +73,20 @@ class TestContraction : public ::testing::Test
                                                            BLayout,
                                                            CDLayout,
                                                            DataType,
+                                                           ComputeDataType,
                                                            DTupleDataType,
                                                            CDElementOp>(true /*do_verification*/,
                                                                         init_method,
                                                                         false /*do_logs*/,
                                                                         false /*time_kernel*/,
                                                                         *p_cd_element_op,
-                                                                        memory_params.M,
-                                                                        memory_params.N,
-                                                                        memory_params.K,
-                                                                        memory_params.StridesA,
-                                                                        memory_params.StridesB,
-                                                                        memory_params.StridesC,
-                                                                        memory_params.StridesD);
+                                                                        dimension_params.M,
+                                                                        dimension_params.N,
+                                                                        dimension_params.K,
+                                                                        StridesA,
+                                                                        StridesB,
+                                                                        StridesC,
+                                                                        StridesD);
                 EXPECT_TRUE(pass);
             }
         }
@@ -99,24 +103,18 @@ class TestContractionBilinear : public TestContraction<Tuple>
 {
 };

+#define ALL_LAYOUT_COMBINATIONS(dt, tuple_dt, compute_dt, op)    \
+    std::tuple<Row, Row, Row, dt, tuple_dt, compute_dt, op>,     \
+        std::tuple<Row, Col, Row, dt, tuple_dt, compute_dt, op>, \
+        std::tuple<Col, Row, Row, dt, tuple_dt, compute_dt, op>, \
+        std::tuple<Col, Col, Row, dt, tuple_dt, compute_dt, op>
+
 using BilinearKernelTypes =
-    ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Row, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Col, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Row, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Col, Row, F64, ck::Tuple<F32>, Bilinear>>;
-
-using ScaleKernelTypes = ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<>, Scale>,
-                                          std::tuple<Row, Col, Row, F32, ck::Tuple<>, Scale>,
-                                          std::tuple<Col, Row, Row, F32, ck::Tuple<>, Scale>,
-                                          std::tuple<Col, Col, Row, F32, ck::Tuple<>, Scale>,
-                                          std::tuple<Row, Row, Row, F64, ck::Tuple<>, Scale>,
-                                          std::tuple<Row, Col, Row, F64, ck::Tuple<>, Scale>,
-                                          std::tuple<Col, Row, Row, F64, ck::Tuple<>, Scale>,
-                                          std::tuple<Col, Col, Row, F64, ck::Tuple<>, Scale>>;
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<F64>, F64, Bilinear)>;
+
+using ScaleKernelTypes = ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, F32, Scale),
+                                          ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<>, F64, Scale)>;

 TYPED_TEST_SUITE(TestContractionBilinear, BilinearKernelTypes);
 TYPED_TEST_SUITE(TestContractionScale, ScaleKernelTypes);
@@ -136,3 +134,46 @@ TYPED_TEST(TestContractionScale, scale)
     this->p_cd_element_op = std::make_unique<Scale>(0.5f);
     this->Run();
 }
+
+template <typename Tuple>
+class TestContractionScaleMixedPrecision : public TestContraction<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestContractionBilinearMixedPrecision : public TestContraction<Tuple>
+{
+};
+
+using BilinearKernelTypesMixedPrecision =
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, F16, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, BF16, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<F64>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F16, ck::Tuple<F16>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(BF16, ck::Tuple<BF16>, F32, Bilinear)>;
+
+using ScaleKernelTypesMixedPrecision =
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, F16, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, BF16, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<>, F32, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F16, ck::Tuple<>, F32, Scale),
+                     ALL_LAYOUT_COMBINATIONS(BF16, ck::Tuple<>, F32, Scale)>;
+
+TYPED_TEST_SUITE(TestContractionBilinearMixedPrecision, BilinearKernelTypesMixedPrecision);
+TYPED_TEST_SUITE(TestContractionScaleMixedPrecision, ScaleKernelTypesMixedPrecision);
+
+TYPED_TEST(TestContractionBilinearMixedPrecision, bilinear)
+{
+    this->p_cd_element_op = std::make_unique<Bilinear>(1.f, 1.f);
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Bilinear>(-0.5f, 0.5f);
+    this->Run();
+}
+
+TYPED_TEST(TestContractionScaleMixedPrecision, scale)
+{
+    this->p_cd_element_op = std::make_unique<Scale>(1.f);
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Scale>(0.5f);
+    this->Run();
+}
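
For reference, each ALL_LAYOUT_COMBINATIONS entry above expands into the four supported layout variants of a single type configuration, e.g.:

// Expansion of ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, F16, Bilinear):
std::tuple<Row, Row, Row, F32, ck::Tuple<F32>, F16, Bilinear>,
std::tuple<Row, Col, Row, F32, ck::Tuple<F32>, F16, Bilinear>,
std::tuple<Col, Row, Row, F32, ck::Tuple<F32>, F16, Bilinear>,
std::tuple<Col, Col, Row, F32, ck::Tuple<F32>, F16, Bilinear>

so each five-entry mixed-precision type list instantiates 5 x 4 = 20 typed test configurations.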
@@ -34,11 +34,11 @@ class ContractionInstanceWrapper
     static constexpr ck::index_t NumDim = 2;

     // clang-format off
     using ContractionDeviceInstance = ck::tensor_operation::device::
-        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-        //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-        //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        DeviceContractionMultipleD_Xdl_CShuffle< NumDim, NumDim, NumDim, F32, F32, F32, F32, ck::Tuple<F32>, F32, Pass, Pass, Bilinear, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
+        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| Compute| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+        //#####################################| | | | Type| Type| Type| DataType| Type| Type| Data| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################################| | | | | | | | | | Type| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+        //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        DeviceContractionMultipleD_Xdl_CShuffle< NumDim, NumDim, NumDim, F32, F32, F32, F32, ck::Tuple<F32>, F32, F32, Pass, Pass, Bilinear, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
     // clang-format on

     bool isSupported(std::vector<ck::index_t>& ADims,
 ...
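
The extra F32 inserted after ck::Tuple<F32> in this instance fills the new Compute/Data/Type column: DeviceContractionMultipleD_Xdl_CShuffle now takes the compute data type immediately after the EData type, mirroring the ComputeDataType parameter threaded through profile_contraction_impl in the profiler sources above.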