"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "fbbdd0e3be99969beb2dc6048ebae5d523e58503"
Unverified commit d00e6115 authored by Adam Osewski, committed by GitHub

Gemm reduce examples int4/int8/fp32/bf16 (#368)



* GEMM + Reduce max fp16+fp32

* GEMM + Max bf16 + int8

* Refactor common definitions.

* Refactor common functions of the mean/meansquare example.

* More examples for mean meansquare.

* Update int8 examples and skip them because of random errors.

* Int4 examples.

* Fix examples for max int4/8

* Tensor conversion for int4 input data for mean meansquare example.

* Remove int4 mean_meansquare example

* Fix int8 mean_meansquare example.

- All ReduceAccDataType and R<N>DataType have to be F32; the INT32 data type gives wrong results.

* Guard int4 with ifdef

* Change int8 example to add_addsquare due to an integer-division rounding error (see the sketch after the commit metadata below).

* Clang format

* Change the return type of the common function.

* Bring back the int8 example with division.

* Remove int8 mean meansquare.

* Use proper cast for BF16 data type.

* Use ck::literals.

* Use proper data type for host tensors & reference.

- Use ReduceAccDataType for the reference GEMM output data type.
- Cast the host reference output tensor to EDataType.
- Fix ifdefs for int4.
Co-authored-by: Adam Osewski <aosewski@amd.com>
parent 45adb736
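The integer-division rounding issue mentioned in the commit message can be reproduced outside the library. The following standalone sketch uses hypothetical values (not taken from this commit) to show why a mean computed with an INT32 accumulator drifts from a floating-point reference:

// Minimal, self-contained illustration of the rounding issue that motivated
// switching the int8 example from mean/meansquare to add/addsquare.
#include <cstdint>
#include <iostream>

int main()
{
    const std::int32_t row_sum = 7; // hypothetical accumulated row sum
    const std::int32_t n       = 2; // hypothetical row length (N)

    const std::int32_t int_mean   = row_sum / n;                     // truncates to 3
    const float        float_mean = static_cast<float>(row_sum) / n; // 3.5

    std::cout << "INT32 mean: " << int_mean << ", float mean: " << float_mean << std::endl;
    return 0;
}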
add_custom_target(example_gemm_reduce_xdl)
add_custom_target(example_gemm_reduce_xdl_max)
add_custom_target(example_gemm_reduce_xdl_mean_meansquare)
add_custom_target(example_gemm_add_add_mean_meansquare_xdl)
add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp)
add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp)
add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp)
add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp)
add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp)
add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp)
add_dependencies(example_gemm_reduce_xdl_max
example_gemm_max_xdl_bf16
example_gemm_max_xdl_fp16
example_gemm_max_xdl_fp32
example_gemm_max_xdl_int8)
add_dependencies(example_gemm_reduce_xdl_mean_meansquare
example_gemm_mean_meansquare_xdl_fp16
example_gemm_mean_meansquare_xdl_fp32
example_gemm_mean_meansquare_xdl_bf16
example_gemm_add_addsquare_xdl_int8)
add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16)
add_dependencies(example_gemm_reduce_xdl
example_gemm_reduce_xdl_mean_meansquare
example_gemm_reduce_xdl_max
example_gemm_add_add_mean_meansquare_xdl)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp)
add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = INT8;
using BDataType = INT8;
using GemmAccDataType = INT32;
using CShuffleDataType = INT32;
using DsDataType = ck::Tuple<>;
using EDataType = INT8;
using ReduceAccDataType = INT32;
using R0DataType = INT32;
using R1DataType = INT32;
using RsDataType = ck::Tuple<R0DataType, R1DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using Square = ck::tensor_operation::element_wise::UnarySquare;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough, Square>;
using RsElementOp = ck::Tuple<PassThrough, PassThrough>;
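// R0 reduces E itself and R1 reduces Square(E) (per-row sum and sum of squares);
// there is no final division here, hence add/addsquare rather than mean/meansquare.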
// ReduceOp
using R0ThreadReduceOp = ck::reduce::Add;
using R1ThreadReduceOp = ck::reduce::Add;
using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
16, // ABlockTransfer SrcScalarPerVector
16, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
16, // BBlockTransfer SrcScalarPerVector
16, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using namespace ck::literals;
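// ck::literals provides the _uz suffix (std::size_t literals) used in the host
// tensor descriptors and the FLOP/byte counts below.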
template <typename ADataType,
typename BDataType,
typename EDataType,
typename R0DataType,
typename R1DataType,
typename ALayout,
typename BLayout,
typename ELayout,
typename AElementOp,
typename BElementOp,
typename CDEElementOp,
typename QsElementOp,
typename RsElementOp,
typename RsThreadReduceOp,
typename ReduceAccDataType,
typename DeviceOpInstance,
typename ReferenceGemmInstance>
bool run_gemm_reduce_add_addsquare_xdl(ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideE,
bool do_verification,
int init_method,
bool time_kernel)
{
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor({len}, {stride});
};
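// 2D host descriptor: row-major layouts use strides {stride, 1}, column-major layouts use {1, stride}.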
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> r1_m(f_host_tensor_descriptor1d(M, 1));
switch(init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
a_m_k.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
b_k_n.end());
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{};
// Prepare GEMM, add, add_square
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
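// DsDataType is an empty ck::Tuple<>, so the Ds pointer list and StrideDs below are passed as empty braces.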
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
// initialize the reduction buffers to 0 (required because the global reduce op is AtomicAdd)
r0_device_buf.SetZero();
r1_device_buf.SetZero();
invoker.Run(argument, StreamConfig{nullptr, false});
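// Run once without timing so E and the reduction buffers hold results for verification.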
bool pass = true;
if(do_verification)
{
auto I0 = ck::Number<0>{};
auto I1 = ck::Number<1>{};
Tensor<ReduceAccDataType> e_m_n_host(e_m_n.mDesc);
Tensor<R0DataType> r0_m_host(r0_m.mDesc);
Tensor<R1DataType> r1_m_host(r1_m.mDesc);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
auto reduce0_op = RsThreadReduceOp{}[I0];
auto reduce1_op = RsThreadReduceOp{}[I1];
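// Host reference for the reductions: per-row sum of E and sum of Square(E),
// mirroring the device-side Add thread reductions.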
for(int m = 0; m < M; ++m)
{
auto reduce0_acc = reduce0_op.template GetIdentityValue<ReduceAccDataType>();
auto reduce1_acc = reduce1_op.template GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
ReduceAccDataType square_e_val;
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n_host(m, n));
qs_element_op[I1](square_e_val, e_val);
reduce0_op(reduce0_acc, e_val);
reduce1_op(reduce1_acc, square_e_val);
}
r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
r1_m_host(m) = ck::type_convert<R1DataType>(reduce1_acc);
}
e_device_buf.FromDevice(e_m_n.mData.data());
Tensor<EDataType> e_m_n_host_converted(e_m_n_host);
pass = ck::utils::check_err(
e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2);
r0_device_buf.FromDevice(r0_m.mData.data());
r1_device_buf.FromDevice(r1_m.mData.data());
pass &= ck::utils::check_err(
r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
pass &= ck::utils::check_err(
r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2);
if(pass)
{
std::cout << "Success!" << std::endl;
}
}
if(time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = 2_uz * M * N * K + 3_uz * M * N;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M;
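// ave_time is in ms, so flop / 1e9 / ms yields TFLOP/s and bytes / 1e6 / ms yields GB/s.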
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec
<< " GB/s, " << std::endl;
}
return pass;
}
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1152;
ck::index_t K = 512;
ck::index_t StrideA = 512;
ck::index_t StrideB = 512;
ck::index_t StrideE = 1152;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return !run_gemm_reduce_add_addsquare_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = BF16;
using BDataType = BF16;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ReduceAccDataType = F32;
using R0DataType = F32;
using RsDataType = ck::Tuple<R0DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
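// Partial Max results from each workgroup are combined in global memory via AtomicMax.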
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalarPerVector
8, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalarPerVector
8, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1152;
ck::index_t K = 256;
ck::index_t StrideA = 256;
ck::index_t StrideB = 256;
ck::index_t StrideE = 1152;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "gemm_reduce_xdl_common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using F64 = double;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = F16;
@@ -45,7 +24,6 @@ using BLayout = Col;
using ELayout = Row;
// Elementwise op
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
@@ -54,7 +32,6 @@ using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
@@ -62,56 +39,72 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalarPerVector
8, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalarPerVector
8, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
EDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
template <typename ADataType, typename BDataType, typename EDataType, typename R0DataType>
void DumpPerf(float ave_time, int M, int N, int K)
int main(int argc, char* argv[])
{
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(EDataType) * M * N + sizeof(R0DataType) * M;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec
<< " GB/s, " << std::endl;
}
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
std::vector<std::size_t>({stride}));
};
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
int main()
{
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
@@ -120,108 +113,55 @@ int main()
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{};
// Prepare GEMM, max
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!device_op.IsSupportedArgument(argument))
if(argc == 1)
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
// do nothing
}
// [CAUTION]: launch_and_time_kernel will not initialize D.
// If we evaluate the kernel multiple times without initializing D, verification will fail.
r0_device_buf.SetValue(ck::NumericLimits<R0DataType>::Lowest());
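// A Max reduction accumulated with AtomicMax must start from the lowest representable value, not zero.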
invoker.Run(argument, StreamConfig{nullptr, false});
bool do_verification = true;
bool pass = true;
if(do_verification)
else if(argc == 4)
{
auto I0 = ck::Number<0>{};
Tensor<EDataType> e_m_n_host(e_m_n.mDesc);
Tensor<R0DataType> r0_m_host(r0_m.mDesc);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
auto reduce0_op = RsThreadReduceOp{}[I0];
for(int m = 0; m < M; ++m)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n_host(m, n));
reduce0_op(reduce0_acc, e_val);
};
r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
}
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
e_device_buf.FromDevice(e_m_n.mData.data());
r0_device_buf.FromDevice(r0_m.mData.data());
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
pass = ck::utils::check_err(
e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results e", 1e-2, 1e-2);
pass &= ck::utils::check_err(
r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
bool time_kernel = true;
if(time_kernel)
else
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
DumpPerf<ADataType, BDataType, EDataType, R0DataType>(ave_time, M, N, K);
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return pass ? 0 : 1;
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = F32;
using BDataType = F32;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F32;
using ReduceAccDataType = F32;
using R0DataType = F32;
using RsDataType = ck::Tuple<R0DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
4, // ABlockTransfer SrcScalarPerVector
4, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
4, // BBlockTransfer SrcScalarPerVector
4, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: Measure kernel execution time (1=ON, 0=Off)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
exit(0);
}
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ADataType = INT4;
using ADataKernelType = INT8;
using BDataType = INT4;
using BDataKernelType = INT8;
using GemmAccDataType = INT32;
using CShuffleDataType = INT32;
using DsDataType = ck::Tuple<>;
using EDataType = INT4;
using EDataKernelType = INT8;
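// The device kernel stores int4 values in int8 containers; the *DataKernelType aliases are
// passed as extra template arguments to run_gemm_reduce_max_xdl below so the int4 host
// tensors can be converted to and from the kernel's int8 storage.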
using ReduceAccDataType = INT32;
using R0DataType = INT32;
using RsDataType = ck::Tuple<R0DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataKernelType, // ADataType
BDataKernelType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataKernelType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
16, // ABlockTransfer SrcScalarPerVector
16, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
16, // BBlockTransfer SrcScalarPerVector
16, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1152;
ck::index_t K = 256;
ck::index_t StrideA = 256;
ck::index_t StrideB = 256;
ck::index_t StrideE = 1152;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance,
ADataKernelType,
BDataKernelType,
EDataKernelType>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
using ADataType = INT8;
using BDataType = INT8;
using GemmAccDataType = INT32;
using CShuffleDataType = INT32;
using DsDataType = ck::Tuple<>;
using EDataType = INT8;
using ReduceAccDataType = INT32;
using R0DataType = INT32;
using RsDataType = ck::Tuple<R0DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
16, // ABlockTransfer SrcScalarPerVector
16, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
16, // BBlockTransfer SrcScalarPerVector
16, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1152;
ck::index_t K = 512;
ck::index_t StrideA = 512;
ck::index_t StrideB = 512;
ck::index_t StrideE = 1152;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return run_gemm_reduce_max_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = BF16;
using BDataType = BF16;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ReduceAccDataType = F32;
using R0DataType = F32;
using R1DataType = F32;
using RsDataType = ck::Tuple<R0DataType, R1DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using Square = ck::tensor_operation::element_wise::UnarySquare;
using Div = ck::tensor_operation::element_wise::UnaryDivide;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough, Square>;
using RsElementOp = ck::Tuple<Div, Div>;
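// UnaryDivide applied to the reduced row values turns the Add reductions into mean and
// mean-of-squares; RsElementOp is constructed with {N, N} at run time (see the
// RsElementOp{N, N} line in the fp16 example diff further below).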
// ReduceOp
using R0ThreadReduceOp = ck::reduce::Add;
using R1ThreadReduceOp = ck::reduce::Add;
using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalarPerVector
8, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalarPerVector
8, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1152;
ck::index_t K = 192;
ck::index_t StrideA = 192;
ck::index_t StrideB = 192;
ck::index_t StrideE = 1152;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "gemm_reduce_xdl_common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = F16;
@@ -45,7 +25,6 @@ using BLayout = Col;
using ELayout = Row;
// Elementwise op
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Square = ck::tensor_operation::element_wise::UnarySquare;
using Div = ck::tensor_operation::element_wise::UnaryDivide;
using AElementOp = PassThrough;
@@ -67,61 +46,71 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalarPerVector
8, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalarPerVector
8, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
EDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
template <typename ADataType,
typename BDataType,
typename EDataType,
typename R0DataType,
typename R1DataType>
void DumpPerf(float ave_time, int M, int N, int K)
int main(int argc, char* argv[])
{
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec
<< " GB/s, " << std::endl;
}
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
std::vector<std::size_t>({stride}));
};
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
int main()
{
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
@@ -130,125 +119,56 @@ int main()
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> r1_m(f_host_tensor_descriptor1d(M, 1));
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{N, N};
// Prepare GEMM, mean, mean_square
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!device_op.IsSupportedArgument(argument))
if(argc == 1)
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
// do nothing
}
// initialize the reduction buffers to 0 (required because the global reduce op is AtomicAdd)
r0_device_buf.SetZero();
r1_device_buf.SetZero();
invoker.Run(argument, StreamConfig{nullptr, false});
bool do_verification = true;
bool pass = true;
if(do_verification)
else if(argc == 4)
{
auto I0 = ck::Number<0>{};
auto I1 = ck::Number<1>{};
Tensor<EDataType> e_m_n_host(e_m_n.mDesc);
Tensor<R0DataType> r0_m_host(r0_m.mDesc);
Tensor<R1DataType> r1_m_host(r1_m.mDesc);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
auto reduce0_op = R0ThreadReduceOp{};
auto reduce1_op = R1ThreadReduceOp{};
for(int m = 0; m < M; ++m)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
ReduceAccDataType square_e_val;
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n_host(m, n));
qs_element_op[I1](square_e_val, e_val);
reduce0_op(reduce0_acc, e_val);
reduce1_op(reduce1_acc, square_e_val);
}
rs_element_op[I0](reduce0_acc, reduce0_acc);
rs_element_op[I1](reduce1_acc, reduce1_acc);
r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
r1_m_host(m) = ck::type_convert<R1DataType>(reduce1_acc);
}
e_device_buf.FromDevice(e_m_n.mData.data());
r0_device_buf.FromDevice(r0_m.mData.data());
r1_device_buf.FromDevice(r1_m.mData.data());
pass = ck::utils::check_err(
e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results c", 1e-2, 1e-2);
pass &= ck::utils::check_err(
r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
pass &= ck::utils::check_err(
r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2);
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
bool time_kernel = true;
if(time_kernel)
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
DumpPerf<ADataType, BDataType, EDataType, R0DataType, R1DataType>(ave_time, M, N, K);
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return pass ? 0 : 1;
return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_reduce_xdl_common.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
// DataType
using ADataType = F32;
using BDataType = F32;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F32;
using ReduceAccDataType = F32;
using R0DataType = F32;
using R1DataType = F32;
using RsDataType = ck::Tuple<R0DataType, R1DataType>;
// Layout
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
// Elementwise op
using Square = ck::tensor_operation::element_wise::UnarySquare;
using Div = ck::tensor_operation::element_wise::UnaryDivide;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough, Square>;
using RsElementOp = ck::Tuple<Div, Div>;
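// The Qs ops are applied to each GEMM output element before the thread-level reduction
// (PassThrough feeds the mean, Square feeds the mean-square); the Rs ops are applied to the
// reduced row sums afterwards, and both divide by N to turn the sums into per-row mean and
// mean-square values.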
// ReduceOp
using R0ThreadReduceOp = ck::reduce::Add;
using R1ThreadReduceOp = ck::reduce::Add;
using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
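// Partial results from different workgroups are combined in global memory with atomic adds,
// which is why the r0/r1 reduction buffers are zero-initialized before the kernel launch
// (see SetZero() in the common runner below).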
static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
<ALayout, // ALayout
BLayout, // BLayout
ELayout, // ELayout
ADataType, // ADataType
BDataType, // BDataType
GemmAccDataType, // GemmAccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType
EDataType, // EDataType
ReduceAccDataType, // ReduceAccDataType
RsDataType, // RsDataType
AElementOp, // AElementwiseOperation
BElementOp, // BElementwiseOperation
CDEElementOp, // CDE ElementwiseOperation
QsElementOp, // Qs Elementwise Operation
RsElementOp, // Rs Elementwise Operation
RsThreadReduceOp, // Thread Reduce Operation
RsGlobalReduceOp, // Global Reduce Operation
GemmDefault, // GEMM Specialization
1, // NumGemmKPrefetchStage
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
4, // ABlockTransfer SrcScalarPerVector
4, // ABlockTransfer DstScalarPerVector_K1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
4, // BBlockTransfer SrcScalarPerVector
4, // BBlockTransfer DstScalarPerVector_K1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock
4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock
1>; // RThread DstScalarPerVector _MPerBlock
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
ReduceAccDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CDEElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
}
else
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< " arg3: Measure kernel execution time (1=ON, 0=Off)\n"
<< " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"
<< std::endl;
exit(EXIT_SUCCESS);
}
return !run_gemm_reduce_mean_meansquare_xdl<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp,
QsElementOp,
RsElementOp,
RsThreadReduceOp,
ReduceAccDataType,
DeviceOpInstance,
ReferenceGemmInstance>(
M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <initializer_list>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/stream_config.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using F64 = double;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using INT4 = ck::int4_t;
#endif
using INT8 = std::int8_t;
using INT32 = std::int32_t;
template <typename ADataType, typename BDataType, typename EDataType, typename R0DataType>
void DumpGemmReduceMaxPerf(float ave_time, int M, int N, int K)
{
using namespace ck::literals;
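    // FLOP count covers the GEMM only (one multiply and one add per M*N*K MAC);
    // the row-wise max reduction is not included.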
std::size_t flop = 2_uz * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(EDataType) * M * N + sizeof(R0DataType) * M;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec
<< " GB/s, " << std::endl;
}
template <typename ADataType,
typename BDataType,
typename EDataType,
typename R0DataType,
typename R1DataType>
void DumpGemmReduceMeanSquareMeanPerf(float ave_time, int M, int N, int K)
{
using namespace ck::literals;
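    // 2*M*N*K multiply-adds for the GEMM, plus per row: N adds for the sum, N squares and
    // N adds for the sum of squares, and 2 divisions -- hence the extra M*(3*N + 2) term.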
std::size_t flop = 2_uz * M * N * K + M * (3_uz * N + 2_uz);
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec
<< " GB/s, " << std::endl;
}
template <typename ADataType,
typename BDataType,
typename EDataType,
typename R0DataType,
typename ALayout,
typename BLayout,
typename ELayout,
typename AElementOp,
typename BElementOp,
typename CDEElementOp,
typename QsElementOp,
typename RsElementOp,
typename RsThreadReduceOp,
typename ReduceAccDataType,
typename DeviceOpInstance,
typename ReferenceGemmInstance,
typename ADataKernelType = ADataType,
typename BDataKernelType = BDataType,
typename EDataKernelType = EDataType>
auto run_gemm_reduce_max_xdl(ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideE,
bool do_verification,
int init_method,
bool time_kernel)
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
static_assert(sizeof(ADataType) == sizeof(ADataKernelType));
static_assert(sizeof(BDataType) == sizeof(BDataKernelType));
static_assert(sizeof(EDataType) == sizeof(EDataKernelType));
#endif
using namespace ck::literals;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor({len}, {stride});
};
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataKernelType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
switch(init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
a_m_k.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
b_k_n.end());
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
break;
}
DeviceMem a_device_buf(sizeof(ADataKernelType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataKernelType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataKernelType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if constexpr(std::is_same_v<ADataType, ck::int4_t>)
{
Tensor<ADataKernelType> a_m_k_converted = a_m_k.template CopyAsType<ADataKernelType>();
Tensor<BDataKernelType> b_k_n_converted = b_k_n.template CopyAsType<BDataKernelType>();
a_device_buf.ToDevice(a_m_k_converted.mData.data());
b_device_buf.ToDevice(b_k_n_converted.mData.data());
}
else
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
{
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
}
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{};
// Prepare GEMM, max
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
    // [CAUTION]: launch_and_time_kernel does not initialize D.
    // If the kernel is run multiple times without re-initializing D, verification will fail.
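    // NumericLimits<R0DataType>::Lowest() is the identity element of the max reduction,
    // so it is a safe starting value for the output buffer.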
r0_device_buf.SetValue(ck::NumericLimits<R0DataType>::Lowest());
invoker.Run(argument, StreamConfig{nullptr, false});
bool pass = true;
if(do_verification)
{
auto I0 = ck::Number<0>{};
Tensor<ReduceAccDataType> e_m_n_host(e_m_n.mDesc);
Tensor<R0DataType> r0_m_host(r0_m.mDesc);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
auto reduce0_op = RsThreadReduceOp{}[I0];
for(int m = 0; m < M; ++m)
{
auto reduce0_acc = reduce0_op.template GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
auto e_val = e_m_n_host(m, n);
reduce0_op(reduce0_acc, e_val);
            }
r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
}
e_device_buf.FromDevice(e_m_n.mData.data());
Tensor<EDataType> e_m_n_host_converted(e_m_n_host);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if constexpr(std::is_same_v<ADataType, ck::int4_t>)
{
Tensor<EDataType> e_m_n_device_converted(e_m_n);
pass = ck::utils::check_err(e_m_n_device_converted.mData,
e_m_n_host_converted.mData,
"Error: Incorrect results c",
1e-2,
1e-2);
}
else
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
{
pass = ck::utils::check_err(
e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2);
}
r0_device_buf.FromDevice(r0_m.mData.data());
pass &= ck::utils::check_err(
r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
if(pass)
{
std::cout << "Success!" << std::endl;
}
}
if(time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
DumpGemmReduceMaxPerf<ADataType, BDataType, EDataType, R0DataType>(ave_time, M, N, K);
}
return pass ? 0 : 1;
}
template <typename ADataType,
typename BDataType,
typename EDataType,
typename R0DataType,
typename R1DataType,
typename ALayout,
typename BLayout,
typename ELayout,
typename AElementOp,
typename BElementOp,
typename CDEElementOp,
typename QsElementOp,
typename RsElementOp,
typename RsThreadReduceOp,
typename ReduceAccDataType,
typename DeviceOpInstance,
typename ReferenceGemmInstance,
typename ADataKernelType = ADataType,
typename BDataKernelType = BDataType,
typename EDataKernelType = EDataType>
bool run_gemm_reduce_mean_meansquare_xdl(ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideE,
bool do_verification,
int init_method,
bool time_kernel)
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
static_assert(sizeof(ADataType) == sizeof(ADataKernelType));
static_assert(sizeof(BDataType) == sizeof(BDataKernelType));
static_assert(sizeof(EDataType) == sizeof(EDataKernelType));
#endif
using namespace ck::literals;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor({len}, {stride});
};
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataKernelType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> r1_m(f_host_tensor_descriptor1d(M, 1));
switch(init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
a_m_k.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
b_k_n.end());
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
break;
}
DeviceMem a_device_buf(sizeof(ADataKernelType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataKernelType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataKernelType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize());
DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if constexpr(std::is_same_v<ADataType, ck::int4_t>)
{
Tensor<ADataKernelType> a_m_k_converted = a_m_k.template CopyAsType<ADataKernelType>();
Tensor<BDataKernelType> b_k_n_converted = b_k_n.template CopyAsType<BDataKernelType>();
a_device_buf.ToDevice(a_m_k_converted.mData.data());
b_device_buf.ToDevice(b_k_n_converted.mData.data());
}
else
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
{
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
}
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
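    // N is passed as the divisor to both reduction element-wise ops so that the accumulated
    // row sums come out as mean and mean-square.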
auto rs_element_op = RsElementOp{N, N};
// Prepare GEMM, mean, mean_square
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
    // init reduction buffer to 0
r0_device_buf.SetZero();
r1_device_buf.SetZero();
invoker.Run(argument, StreamConfig{nullptr, false});
bool pass = true;
if(do_verification)
{
auto I0 = ck::Number<0>{};
auto I1 = ck::Number<1>{};
Tensor<ReduceAccDataType> e_m_n_host(e_m_n.mDesc);
Tensor<R0DataType> r0_m_host(r0_m.mDesc);
Tensor<R1DataType> r1_m_host(r1_m.mDesc);
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
auto reduce0_op = RsThreadReduceOp{}[I0];
auto reduce1_op = RsThreadReduceOp{}[I1];
for(int m = 0; m < M; ++m)
{
auto reduce0_acc = reduce0_op.template GetIdentityValue<ReduceAccDataType>();
auto reduce1_acc = reduce1_op.template GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
ReduceAccDataType square_e_val;
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n_host(m, n));
qs_element_op[I1](square_e_val, e_val);
reduce0_op(reduce0_acc, e_val);
reduce1_op(reduce1_acc, square_e_val);
}
rs_element_op[I0](reduce0_acc, reduce0_acc);
rs_element_op[I1](reduce1_acc, reduce1_acc);
r0_m_host(m) = ck::type_convert<R0DataType>(reduce0_acc);
r1_m_host(m) = ck::type_convert<R1DataType>(reduce1_acc);
}
e_device_buf.FromDevice(e_m_n.mData.data());
Tensor<EDataType> e_m_n_host_converted(e_m_n_host);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if constexpr(std::is_same_v<ADataType, ck::int4_t>)
{
Tensor<EDataType> e_m_n_device_converted(e_m_n);
pass = ck::utils::check_err(e_m_n_device_converted.mData,
e_m_n_host_converted.mData,
"Error: Incorrect results c",
1e-2,
1e-2);
}
else
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
{
pass = ck::utils::check_err(
e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2);
}
r0_device_buf.FromDevice(r0_m.mData.data());
r1_device_buf.FromDevice(r1_m.mData.data());
pass &= ck::utils::check_err(
r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
pass &= ck::utils::check_err(
r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2);
if(pass)
{
std::cout << "Success!" << std::endl;
}
}
if(time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
DumpGemmReduceMeanSquareMeanPerf<ADataType, BDataType, EDataType, R0DataType, R1DataType>(
ave_time, M, N, K);
}
return pass;
}
@@ -21,9 +21,9 @@ namespace reduce {
 // vector space
 // (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf).
 // 2) IsCompatibleInMemoryDataOperation() -- return true if the reduction task corresponding to this
-// operator can use the InMemoryDataOperation to finalize, or else it return false 3) operator() --
-// the first argument of the operator must be both an input & output, and the corresponding variable
-// usually stores
+// operator can use the InMemoryDataOperation to finalize, or else it return false
+// 3) operator() -- the first argument of the operator must be both an input & output, and the
+// corresponding variable usually stores
 //    the accumulated result of many operator() calls; the second argument is only an
 //    input. For indexable binary
 //    operator, the second version of operator() has third argument (which is an
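As an illustration of the contract described in the comment above, here is a minimal, hypothetical operator in that shape. It is not the actual ck::reduce::Add definition: device qualifiers and the indexable variant of operator() are omitted, and ck::InMemoryDataOperationEnum from "ck/ck.hpp" is assumed to be visible.
struct ExampleAdd
{
    // Identity element: starting the accumulator at 0 leaves any value it meets unchanged.
    template <typename T>
    static constexpr T GetIdentityValue()
    {
        return static_cast<T>(0);
    }
    // A sum of partial sums can be finalized with atomic adds in global memory,
    // the in-memory operation used by the mean/mean-square examples in this commit.
    static constexpr bool IsCompatibleInMemoryDataOperation(ck::InMemoryDataOperationEnum op)
    {
        return op == ck::InMemoryDataOperationEnum::AtomicAdd;
    }
    // The first argument is both input and output; it accumulates across calls.
    template <typename T>
    constexpr void operator()(T& acc, const T& value) const
    {
        acc = acc + value;
    }
};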
@@ -259,7 +259,7 @@ struct Tensor
         Tensor<OutT> ret(mDesc);
         for(size_t i = 0; i < mData.size(); i++)
         {
-            ret.mData[i] = static_cast<OutT>(mData[i]);
+            ret.mData[i] = ck::type_convert<OutT>(mData[i]);
         }
         return ret;
     }