Unverified Commit 171b9030 authored by Mirza Halilčević's avatar Mirza Halilčević Committed by GitHub
Browse files

Merge branch 'develop' into gemm_elementwise_gemm

parents 417f805f da0c21f6
......@@ -102,11 +102,22 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
f_host_tensor_descriptor(N, C, Do, Ho, Wo));
constexpr int inDataRangeTensor1{1};
constexpr int inDataRangeTensor2{5};
constexpr double inDataRangeTensor3{0.5};
switch(in_params.init_method)
{
case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{}); break;
case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
case 0:
in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{inDataRangeTensor1});
break;
case 1:
in_n_c_di_hi_wi.GenerateTensorValue(
GeneratorTensor_2<InDataType>{-inDataRangeTensor2, inDataRangeTensor2});
break;
default:
in_n_c_di_hi_wi.GenerateTensorValue(
GeneratorTensor_3<InDataType>{-inDataRangeTensor3, inDataRangeTensor3});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
......@@ -229,12 +240,39 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
{
out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
auto tolerance = 1e-3;
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
auto number_of_accumulations = 1;
static_assert(
ReduceOpId == ck::ReduceTensorOp::AVG || ReduceOpId == ck::ReduceTensorOp::MAX,
"Warning: Unhandled ReduceOpId for setting up the number of accumulations!");
if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
{
for(size_t i = 0; i < kernel_params.window_spatial_lengths.size(); ++i)
{
number_of_accumulations *= kernel_params.window_spatial_lengths.at(i);
}
}
auto absolute_error_threshold = 1.0;
switch(in_params.init_method)
{
case 0: absolute_error_threshold = static_cast<double>(inDataRangeTensor1); break;
case 1: absolute_error_threshold = static_cast<double>(inDataRangeTensor2); break;
default: absolute_error_threshold = inDataRangeTensor3;
}
absolute_error_threshold =
ck::utils::get_absolute_threshold<ComputeDataType, OutDataType>(
absolute_error_threshold, number_of_accumulations);
auto relative_error_threshold =
ck::utils::get_relative_threshold<ComputeDataType, OutDataType>(
number_of_accumulations);
bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
out_n_c_do_ho_wo_host.mData,
"Error: Incorrect results",
tolerance,
tolerance);
relative_error_threshold,
absolute_error_threshold);
if constexpr(OutputIndex)
{
......
......@@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
......@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
......
......@@ -27,6 +27,7 @@ enum struct GemmDataType
F16_F8_F16, // 5
F16_F16_F16_F8, // 6
F8_F8_BF16, // 7
INT8_INT8_BF16, // 8
};
#define OP_NAME "gemm_multiply_multiply"
......@@ -39,7 +40,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, "
"comp f8)\n");
"comp f8; 8: int8->bf16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
......@@ -89,6 +90,8 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
using F32 = float;
using BF16 = ck::bhalf_t;
using F8 = ck::f8_t;
using I8 = int8_t;
using I32 = int;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
......@@ -162,6 +165,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
return profile(
F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(
I8{}, I8{}, I8{}, I32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
......
......@@ -57,6 +57,25 @@ int profile_gemm_universal(int argc, char* argv[])
exit(1);
}
int M;
int N;
int StrideA;
int StrideB;
// Analyze the unsupported matrix shapes, switch the M and N number
if(std::stoi(argv[9]) % 8 != 0 && std::stoi(argv[8]) % 8 == 0)
{
M = std::stoi(argv[9]);
StrideA = std::stoi(argv[12]);
N = std::stoi(argv[8]);
StrideB = std::stoi(argv[11]);
}
else
{
M = std::stoi(argv[8]);
StrideA = std::stoi(argv[11]);
N = std::stoi(argv[9]);
StrideB = std::stoi(argv[12]);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
......@@ -64,12 +83,8 @@ int profile_gemm_universal(int argc, char* argv[])
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const int KBatch = std::stoi(argv[14]);
......@@ -86,7 +101,9 @@ int profile_gemm_universal(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F8 = ck::f8_t;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t;
#endif
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
......@@ -147,6 +164,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
......@@ -163,6 +181,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
#endif
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
......@@ -179,6 +198,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
......@@ -187,6 +207,7 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
#endif
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_universal_batched_impl.hpp"
#include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmDataType
{
BF16_BF16_BF16, // 0
F8_F8_BF16, // 1
};
#define OP_NAME "gemm_universal_batched"
#define OP_DESC "Batched GEMM Universal"
int profile_batched_gemm_universal(int argc, char* argv[])
{
if(argc != 18 && argc != 21)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: bf16, 1: fp8->bf16)\n");
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
printf(" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
printf("optional:\n");
printf("arg18: number of warm-up cycles (default 1)\n");
printf("arg19: number of iterations (default 10)\n");
printf("arg20: memory for rotating buffer (default 0, size in MB)\n");
// clang-format on
exit(1);
}
int n_warmup = 1;
int n_iter = 10;
uint64_t rotating = 0;
if(argc == 21)
{
n_warmup = std::stoi(argv[18]);
n_iter = std::stoi(argv[19]);
rotating = std::stoull(argv[20]) * 1024 * 1024;
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const int BatchStrideA = std::stoi(argv[14]);
const int BatchStrideB = std::stoi(argv[15]);
const int BatchStrideC = std::stoi(argv[16]);
const int BatchCount = std::stoi(argv[17]);
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t;
#endif
using BF16 = ck::bhalf_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile =
[&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using DsDataType = ck::Tuple<>;
using CDataType = decltype(c_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using DsLayout = ck::Tuple<>;
using CLayout = decltype(c_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmV2MultiD<ALayout,
BLayout,
DsLayout,
CLayout,
ADataType,
BDataType,
DsDataType,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
bool pass = ck::profiler::profile_gemm_universal_batched_impl<ADataType,
BDataType,
CDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
DeviceOp>(do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
BatchStrideA_,
BatchStrideB_,
BatchStrideC_,
StrideA_,
StrideB_,
StrideC_,
BatchCount,
n_warmup,
n_iter,
rotating);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(BF16{}, BF16{}, BF16{}, Row{}, Col{}, Row{});
}
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F8{}, F8{}, BF16{}, Row{}, Col{}, Row{});
}
#endif
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_universal);
......@@ -25,7 +25,8 @@ enum struct ConvDataType
F16_F16_F16, // 1
BF16_F32_BF16, // 2
F16_F16_F16_BF8_F8, // 3
I8_I8_I8 // 4
I8_I8_I8, // 4
BF16_BF16_BF16, // 5
};
#define OP_NAME "grouped_conv_bwd_weight"
......@@ -38,7 +39,8 @@ static void print_helper_msg()
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight fp32, Output bf16\n"
<< " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n"
<< " 4: Input int8, Weight int8, Output int8)\n"
<< " 4: Input int8, Weight int8, Output int8\n"
<< " 5: Input bf16, Weight bf16, Output bf16)\n"
<< "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
"N, K, Ho, Wo]\n"
<< " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
......@@ -180,6 +182,10 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
......@@ -187,6 +193,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
if(data_type == ConvDataType::BF16_BF16_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
......@@ -224,6 +235,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
......@@ -240,6 +256,11 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return profile(I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(
I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
......
......@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F16, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import logging
import os
import subprocess
from dataclasses import replace
from functools import lru_cache
from typing import List
from ..util import library_path
from .op import CKGroupedConvFwdOp
log = logging.getLogger(__name__)
def _ck_conv_instances_path():
conv_instances_path = os.path.join( # noqa: F821
library_path(),
"include",
"ck",
"library",
"tensor_operation_instance",
"gpu",
"grouped_conv_fwd",
)
if not os.path.exists(conv_instances_path):
log.error(
"CK library conv instances path %s does not exist", conv_instances_path
)
return None
return conv_instances_path
def parse_instances(str_instances: List[str]) -> List[CKGroupedConvFwdOp]:
"""
Parse the lines containing Grouped Convolution Forward template instances
into `CKGroupedConvFwdOp` instances
"""
def maybe_int(s):
try:
return int(s)
except ValueError:
return s
op_instances = []
# TODO: maybe use libclang for parsing C++ code in the future
# to avoid this hacky parsing logic below ? :) - copilot
for line in str_instances:
s_template_args = line.split("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")[
-1
].strip("<>, ")
template_args = []
i_current = 0
while i_current < len(s_template_args):
if s_template_args[i_current] == " ":
# skip whitespace
i_current += 1
continue
elif s_template_args[i_current : i_current + 2] == "S<":
# parse template S<Index...>
i_next = s_template_args.find(">", i_current)
template_args.append(
tuple(map(int, s_template_args[i_current + 2 : i_next].split(",")))
)
i_current = i_next + 2
else:
# all string attributes must be either type aliases or global constants in C++
i_next = s_template_args.find(",", i_current)
template_args.append(
maybe_int(
s_template_args[i_current : i_next if i_next != -1 else None]
)
)
if i_next != -1:
i_current = i_next + 1
if i_next == -1:
break
template_args[0] = -1 # n_dim_spatial
template_args[3] = tuple() # ds_layout
template_args[9] = tuple() # ds_element_dtype
new_instance = CKGroupedConvFwdOp(
*template_args, # type: ignore[arg-type]
)
op_instances.append(new_instance)
return op_instances
@lru_cache(None)
def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]:
"""
Parse the Grouped Convolution Forward instances
defined in the Composable Kernel library folder.
"""
ck_library_dir = _ck_conv_instances_path()
if not ck_library_dir:
return []
grep_result = subprocess.run(
[
"grep",
"-inR",
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3",
ck_library_dir,
],
capture_output=True,
text=True,
)
op_instances = parse_instances(grep_result.stdout.strip().split("\n"))
log.debug("ck instances from library: %d", len(op_instances))
schedulers = [
"BlockGemmPipelineScheduler::Intrawave",
"BlockGemmPipelineScheduler::Interwave",
]
conv_specs = [
"ConvolutionForwardSpecialization::Default",
"ConvolutionForwardSpecialization::Filter1x1Pad0",
"ConvolutionForwardSpecialization::Filter1x1Stride1Pad0",
"ConvolutionForwardSpecialization::OddC",
]
# substitute templated args by looping through their domains
substitute_instances = []
for instance in op_instances:
sub_scheduler = (
instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
)
sub_spec = instance.conv_forward_specialization == "ConvSpec"
schedulers_range = (
schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]
)
spec_range = conv_specs if sub_spec else [instance.conv_forward_specialization]
for scheduler in schedulers_range:
for spec in spec_range:
for channels_last in [True, False]:
if channels_last:
a_layout = "NHWGC"
e_layout = "NHWGK"
else:
a_layout = "NGCHW"
e_layout = "NGKHW"
substitute_instances.append(
replace(
instance,
block_gemm_pipeline_scheduler=scheduler,
conv_forward_specialization=spec,
gemm_specialization="GemmSpecialization::MNKPadding",
n_dim_spatial=2,
a_layout=a_layout,
b_layout="GKYXC",
e_layout=e_layout,
)
)
return substitute_instances
if __name__ == "__main__":
print(gen_conv_ops_library())
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
from dataclasses import asdict, dataclass
from typing import Optional, Tuple
@dataclass
class CKGroupedConvFwdOp:
n_dim_spatial: int
a_layout: str
b_layout: str
ds_layout: Tuple[str]
e_layout: str
a_element_dtype: str
b_element_dtype: str
acc_dtype: str
c_shuffle_dtype: str
ds_element_dtype: Tuple[str]
e_element_dtype: str
a_elementwise_op: str
b_elementwise_op: str
cde_elementwise_op: str
conv_forward_specialization: str
gemm_specialization: str
block_size: int
m_per_block: int
n_per_block: int
k_per_block: int
ak1: int
bk1: int
m_per_xdl: int
n_per_xdl: int
m_xdl_per_wave: int
n_xdl_per_wave: int
a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int]
a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
a_block_transfer_src_access_order: Tuple[int, int, int]
a_block_transfer_src_vector_dim: int
a_block_transfer_src_scalar_per_vector: int
a_block_transfer_dst_scalar_per_vector_ak1: int
a_block_lds_extra_m: bool
b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int]
b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
b_block_transfer_src_access_order: Tuple[int, int, int]
b_block_transfer_src_vector_dim: int
b_block_transfer_src_scalar_per_vector: int
b_block_transfer_dst_scalar_per_vector_bk1: int
b_block_lds_extra_n: bool
c_shuffle_m_xdl_per_wave_per_shuffle: int
c_shuffle_n_xdl_per_wave_per_shuffle: int
cde_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: Tuple[ # noqa
int,
int,
int,
int,
]
cde_block_transfer_scalar_per_vector_n_per_block: int
block_gemm_pipeline_scheduler: str
block_gemm_pipeline_version: str
a_compute_dtype: Optional[str] = None
b_compute_dtype: Optional[str] = None
def name(self):
# cpp alias for template instance
return (
f"ck_device_grouped_convolution_fwd_multiple_abd_xdl_c_shuffle_v3_"
f"{self.key_name()}"
)
def key_name(self):
# TBD; must be unique per instance. Intended to use as dict key
return "_".join(
[
"K"
+ field_name.replace("_", "").lower()
+ "V"
+ (
"x".join(map(str, iter(field_value)))
if isinstance(field_value, tuple)
else str(field_value).replace(":", "")
)
for field_name, field_value in self.dict_items()
]
)
def dict_items(self):
return asdict(self).items()
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import logging
import os
import subprocess
from dataclasses import fields, replace
from dataclasses import replace
from functools import lru_cache, partial
from typing import List
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
from dataclasses import asdict, dataclass
from typing import Optional, Tuple
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
import functools
import os
@functools.lru_cache(None)
def library_path():
return os.path.join(os.path.dirname(__file__), 'library')
return os.path.join(os.path.dirname(__file__), "library")
......@@ -65,8 +65,9 @@ def parse_data_type(args):
if args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 3
if args.data_type == "bfp16":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
if args.ck_profier_op == "grouped_conv_bwd_weight":
args.data_type = 5
if args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 2
......
......@@ -133,12 +133,12 @@ def parse_logfile(logfile):
if 'Best Perf' in line:
lst=line.split()
res.append(lst[4])
elif 'onnx_gemm' in logfile or 'mixed_gemm' in logfile:
elif 'onnx_gemm' in logfile:
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
res.append(lst[33])
elif 'splitK_gemm' in logfile:
elif 'splitK_gemm' in logfile or 'mixed_gemm' in logfile:
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
......
......@@ -22,6 +22,7 @@ python3 process_perf_data.py perf_gemm_bilinear.log
python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
python3 process_perf_data.py perf_mixed_gemm.log
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
......
......@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
#only continue if there are some source files left on the list
if(ARGN)
if(ARGN MATCHES "_xdl")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN})
......@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
#only continue if there are some source files left on the list
if(ARGN)
if(ARGN MATCHES "_xdl")
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
elseif(ARGN MATCHES "_wmma")
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(ARGN MATCHES "_smfmac")
list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
endif()
set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_executable(${TEST_NAME} ${ARGN})
......@@ -210,3 +210,4 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL
add_subdirectory(smfmac_op)
endif()
add_subdirectory(position_embedding)
add_subdirectory(scatter_gather)
add_subdirectory(image_to_column)
add_subdirectory(gemm)
# Currently ck_tile is only built on gfx9
if(GPU_TARGETS MATCHES "gfx9")
add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck_tile/host.hpp"
#include "test_gemm_mem_pipeline_util.hpp"
using F16 = ck_tile::half_t;
using F32 = float;
using Row = ck_tile::tensor_layout::gemm::RowMajor;
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
// clang-format off
using KernelTypes = ::testing::Types<
// ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
std::tuple< Row, Col, Row, F16, F16, F32, F16>,
std::tuple< Col, Row, Row, F16, F16, F32, F16>,
std::tuple< Row, Row, Row, F16, F16, F32, F16>,
std::tuple< Col, Col, Row, F16, F16, F32, F16>
>;
// clang-format on
TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
#include "test_gemm_mem_pipeline_ut_cases.inc"
#pragma once
TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 1024;
constexpr int K = 320;
for(int M : Ms)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 1024;
constexpr int K = 320;
for(int M : Ms)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
{
std::vector<int> Ms{127};
constexpr int N = 1024;
constexpr int K = 432;
for(int M : Ms)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
{
std::vector<int> Ms{512};
constexpr int N = 1024;
constexpr int K = 512;
for(int M : Ms)
this->Run(M, N, K);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment