"src/include/threadwise_direct_convolution.cuh" did not exist on "0404f777b453034d81670c54e136334ec067eacd"
Unverified Commit fc9f9756 authored by Bartłomiej Kocot's avatar Bartłomiej Kocot Committed by GitHub
Browse files

Add DeviceBatchedGemmMultipleD_Dl (#732)

* Add DeviceBatchedGemmMultipleD_Dl

* Fix batched_gemm tests

* Fix comments

* test_batched_gemm_multi_d fixes

* Fix args for isSupported batchedGemmMultipleDDl

* Disable tests for gfx90a
parent 7c24654c
...@@ -8,9 +8,11 @@ ...@@ -8,9 +8,11 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp" #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -27,7 +29,11 @@ template <typename ADataType, ...@@ -27,7 +29,11 @@ template <typename ADataType,
typename CDataType, typename CDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout,
typename AElementOp,
typename BElementOp,
typename CElementOp,
typename DeviceOp>
bool profile_batched_gemm_impl(int do_verification, bool profile_batched_gemm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
} }
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto a_element_op = AElementOp{}; const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; const auto c_element_op = CElementOp{};
...@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
b_device_buf.ToDevice(b_g_k_n.mData.data()); b_device_buf.ToDevice(b_g_k_n.mData.data());
c_device_buf.ToDevice(c_g_m_n_device_result.mData.data()); c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
...@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification,
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), // false branch for multi d dl kernel
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), if constexpr(std::is_same<
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), DeviceOp,
M, ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
N, BLayout,
K, CLayout,
StrideA, ADataType,
StrideB, BDataType,
StrideC, CDataType,
BatchStrideA, AElementOp,
BatchStrideB, BElementOp,
BatchStrideC, CElementOp>>::value)
BatchCount, {
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, argument_ptr =
ck::tensor_operation::element_wise::PassThrough{}); op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideC,
BatchCount,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{});
}
else
{
argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
{},
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
BatchCount,
StrideA,
StrideB,
{},
StrideC,
BatchStrideA,
BatchStrideB,
{},
BatchStrideC,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{});
}
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
...@@ -34,6 +34,7 @@ set(PROFILER_SOURCES ...@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
profile_grouped_gemm_fastgelu.cpp profile_grouped_gemm_fastgelu.cpp
profile_contraction_bilinear.cpp profile_contraction_bilinear.cpp
profile_contraction_scale.cpp profile_contraction_scale.cpp
profile_batched_gemm_multi_d.cpp
) )
set(PROFILER_EXECUTABLE ckProfiler) set(PROFILER_EXECUTABLE ckProfiler)
...@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel ...@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#include "profiler/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp" #include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
enum struct GemmMatrixLayout enum struct GemmMatrixLayout
{ {
MK_KN_MN, // 0 MK_KN_MN, // 0
...@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[]) ...@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type, auto profile =
auto b_type, [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
auto c_type, using ADataType = decltype(a_type);
auto a_layout, using BDataType = decltype(b_type);
auto b_layout, using CDataType = decltype(c_type);
auto c_layout) {
using ADataType = decltype(a_type); using ALayout = decltype(a_layout);
using BDataType = decltype(b_type); using BLayout = decltype(b_layout);
using CDataType = decltype(c_type); using CLayout = decltype(c_layout);
using ALayout = decltype(a_layout); const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
using BLayout = decltype(b_layout); const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
using CLayout = decltype(c_layout); const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M; const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K; const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M; const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA; const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB; const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC; const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_; const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_; const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_; const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
bool pass = ck::profiler:: using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
profile_batched_gemm_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>( BLayout,
do_verification, CLayout,
init_method, ADataType,
do_log, BDataType,
time_kernel, CDataType,
M, AElementOp,
N, BElementOp,
K, CElementOp>;
BatchStrideA_,
BatchStrideB_, bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
BatchStrideC_, BDataType,
StrideA_, CDataType,
StrideB_, ALayout,
StrideC_, BLayout,
BatchCount); CLayout,
AElementOp,
return pass ? 0 : 1; BElementOp,
}; CElementOp,
DeviceOp>(do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
BatchStrideA_,
BatchStrideB_,
BatchStrideC_,
StrideA_,
StrideB_,
StrideC_,
BatchCount);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
// Operand layout selector, parsed from arg3 on the command line.
// Names read as <A dims>_<B dims>_<C dims>; the batch dimension g is implicit
// and always leads (see the usage text printed by the profiler entry point).
enum class GemmMatrixLayout
{
    MK_KN_MN = 0, // A[g, m, k] * B[g, k, n] = C[g, m, n]
    MK_NK_MN = 1, // A[g, m, k] * B[g, n, k] = C[g, m, n]
    KM_KN_MN = 2, // A[g, k, m] * B[g, k, n] = C[g, m, n]
    KM_NK_MN = 3, // A[g, k, m] * B[g, n, k] = C[g, m, n]
};
// Element type selector, parsed from arg2 on the command line.
// Names read as <A type>_<B type>_<C type>.
enum class GemmDataType
{
    F16_F16_F16    = 0, // fp16 for A, B and C
    INT8_INT8_INT8 = 1, // int8 for A, B and C
};
#define OP_NAME "batched_gemm_multi_d"
#define OP_DESC "Batched GEMM multi D"
int profile_batched_gemm_multi_d(int argc, char* argv[])
{
if(argc != 18)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp16; 1: int8)\n");
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
printf(" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const int BatchStrideA = std::stoi(argv[14]);
const int BatchStrideB = std::stoi(argv[15]);
const int BatchStrideC = std::stoi(argv[16]);
const int BatchCount = std::stoi(argv[17]);
using F16 = ck::half_t;
using INT8 = int8_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile =
[&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using CDataType = decltype(c_type);
using DsDataType = ck::Tuple<>;
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using CLayout = decltype(c_layout);
using DsLayout = ck::Tuple<>;
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmMultiD<ALayout,
BLayout,
DsLayout,
CLayout,
ADataType,
BDataType,
DsDataType,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
DeviceOp>(do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
BatchStrideA_,
BatchStrideB_,
BatchStrideC_,
StrideA_,
StrideB_,
StrideC_,
BatchCount);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
}
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_multi_d);
...@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization) ...@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization)
add_subdirectory(batchnorm) add_subdirectory(batchnorm)
add_subdirectory(contraction) add_subdirectory(contraction)
add_subdirectory(pool_fwd) add_subdirectory(pool_fwd)
add_subdirectory(batched_gemm_multi_d)
if(GPU_TARGETS MATCHES "gfx1100") if(GPU_TARGETS MATCHES "gfx1100")
add_subdirectory(wmma_op) add_subdirectory(wmma_op)
endif() endif()
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace { namespace {
using ADataType = ck::bhalf_t; using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t; using BDataType = ck::bhalf_t;
...@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t; ...@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace } // namespace
int main() int main()
...@@ -23,21 +27,87 @@ int main() ...@@ -23,21 +27,87 @@ int main()
bool pass = true; bool pass = true;
pass = pass && using namespace ck::tensor_operation::device;
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>( BDataType,
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl; std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1; return pass ? 0 : 1;
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace { namespace {
using ADataType = ck::half_t; using ADataType = ck::half_t;
using BDataType = ck::half_t; using BDataType = ck::half_t;
...@@ -12,6 +14,8 @@ using CDataType = ck::half_t; ...@@ -12,6 +14,8 @@ using CDataType = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace } // namespace
int main() int main()
...@@ -23,21 +27,87 @@ int main() ...@@ -23,21 +27,87 @@ int main()
bool pass = true; bool pass = true;
pass = pass && using namespace ck::tensor_operation::device;
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>( BDataType,
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl; std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1; return pass ? 0 : 1;
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace { namespace {
using ADataType = float; using ADataType = float;
using BDataType = float; using BDataType = float;
...@@ -12,6 +14,8 @@ using CDataType = float; ...@@ -12,6 +14,8 @@ using CDataType = float;
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace } // namespace
int main() int main()
...@@ -23,21 +27,87 @@ int main() ...@@ -23,21 +27,87 @@ int main()
bool pass = true; bool pass = true;
pass = pass && using namespace ck::tensor_operation::device;
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>( BDataType,
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl; std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1; return pass ? 0 : 1;
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp" #include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace { namespace {
using ADataType = int8_t; using ADataType = int8_t;
using BDataType = int8_t; using BDataType = int8_t;
...@@ -12,6 +14,8 @@ using CDataType = int8_t; ...@@ -12,6 +14,8 @@ using CDataType = int8_t;
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor; using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace } // namespace
int main() int main()
...@@ -23,21 +27,87 @@ int main() ...@@ -23,21 +27,87 @@ int main()
bool pass = true; bool pass = true;
pass = pass && using namespace ck::tensor_operation::device;
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>( BDataType,
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>( BDataType,
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl; std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1; return pass ? 0 : 1;
......
# TODO: Enable for gfx90a after compiler fix
if(NOT GPU_TARGETS MATCHES "gfx90a")
add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <gtest/gtest.h>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
namespace {
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Empty_Tuple = ck::Tuple<>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Typed test fixture for DeviceBatchedGemmMultiD with empty Ds tuples.
// Tuple is a std::tuple of three layout tags: (ALayout, BLayout, CLayout).
template <typename Tuple>
class TestBatchedGemmMultiD : public ::testing::Test
{
    protected:
    using ALayout = std::tuple_element_t<0, Tuple>;
    using BLayout = std::tuple_element_t<1, Tuple>;
    using CLayout = std::tuple_element_t<2, Tuple>;

    // Fixed problem size used by every instantiation of the fixture.
    static constexpr int M          = 512;
    static constexpr int N          = 256;
    static constexpr int K          = 128;
    static constexpr int BatchCount = 3;

    // Runs profile_batched_gemm_impl with DataType used for A, B and C and
    // expects it to report success.
    template <typename DataType>
    void Run()
    {
        namespace device = ck::tensor_operation::device;

        using DeviceOp = device::DeviceBatchedGemmMultiD<ALayout,
                                                         BLayout,
                                                         Empty_Tuple,
                                                         CLayout,
                                                         DataType,
                                                         DataType,
                                                         Empty_Tuple,
                                                         DataType,
                                                         PassThrough,
                                                         PassThrough,
                                                         PassThrough>;

        // NOTE(review): profile_batched_gemm_multi_d.cpp passes batch strides
        // before strides in the positions after K, whereas the values here are
        // stride-like (K, N, N) first and batch-stride-like (M * K, K * N,
        // M * N) second, for every layout. Confirm against the
        // profile_batched_gemm_impl signature.
        const bool ok = ck::profiler::profile_batched_gemm_impl<DataType,
                                                                DataType,
                                                                DataType,
                                                                ALayout,
                                                                BLayout,
                                                                CLayout,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough,
                                                                DeviceOp>(
            true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);

        EXPECT_TRUE(ok);
    }
};
using KernelTypes = ::testing::Types<std::tuple<Row, Row, Row>,
std::tuple<Row, Col, Row>,
std::tuple<Col, Row, Row>,
std::tuple<Col, Col, Row>>;
} // namespace
TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment