// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"

// Elementwise epilogue: E = alpha * C + beta * D
struct AlphaBetaAdd
{
    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}

    template <typename E, typename C, typename D>
    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;

    template <>
    __host__ __device__ constexpr void
    operator()(ck::half_t& e, const float& c, const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using ADataType        = F16;
using BDataType        = F16;
using AccDataType      = F32;
using CShuffleDataType = F32;
using DDataType        = F16;
using EDataType        = F16;

using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using ELayout = Row;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = AlphaBetaAdd;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
    ALayout, BLayout, ck::Tuple<DLayout>, ELayout,
    ADataType, BDataType, AccDataType, CShuffleDataType, ck::Tuple<DDataType>, EDataType,
    AElementOp, BElementOp, CDEElementOp, GemmSpec,
    1,                                               // NumGemmKPrefetchStage
    256,                                             // BlockSize
    256, 128, 32,                                    // MPerBlock, NPerBlock, KPerBlock
    8, 8,                                            // AK1, BK1
    32, 32,                                          // MPerXDL, NPerXDL
    4, 2,                                            // MXdlPerWave, NXdlPerWave
    S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, // A block transfer
    S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, // B block transfer
    1, 1,                                            // CShuffle M/NXdlPerWavePerShuffle
    S<1, 32, 1, 8>, 8>;                              // CDE block transfer

int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    // GEMM shape
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideD = 4096;
    ck::index_t StrideE = 4096;

    float alpha = 1.0f;
    float beta  = 1.0f;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 6)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        alpha           = std::stof(argv[4]);
        beta            = std::stof(argv[5]);
    }
    else if(argc == 13)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        M = std::stoi(argv[4]);
        N = std::stoi(argv[5]);
        K = std::stoi(argv[6]);

        StrideA = std::stoi(argv[7]);
        StrideB = std::stoi(argv[8]);
        StrideD = std::stoi(argv[9]);
        StrideE = std::stoi(argv[10]);

        alpha = std::stof(argv[11]);
        beta  = std::stof(argv[12]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 12: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
               "beta\n");
        exit(0);
    }

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;

            // row-major: unit stride along columns; column-major: unit stride along rows
            if(std::is_same<decltype(layout), Row>::value)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
        break;
    default:
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    d_device_buf.ToDevice(d_m_n.mData.data());
    e_device_buf.ToDevice(e_m_n_device_result.mData.data());

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{alpha, beta};

    // do GEMM
    auto device_op = DeviceOpInstance{};
    auto invoker   = device_op.MakeInvoker();
    auto argument =
        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
                               b_device_buf.GetDeviceBuffer(),
                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
                               e_device_buf.GetDeviceBuffer(),
                               M,
                               N,
                               K,
                               StrideA,
                               StrideB,
                               std::array<ck::index_t, 1>{StrideD},
                               StrideE,
                               a_element_op,
                               b_element_op,
                               cde_element_op);

    if(!device_op.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    e_device_buf.FromDevice(e_m_n_device_result.mData.data());

    if(do_verification)
    {
        Tensor<CShuffleDataType> c_m_n({M, N});

        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CShuffleDataType,
                                                                                AccDataType,
                                                                                AElementOp,
                                                                                BElementOp,
                                                                                PassThrough>;

        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument =
            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});

        ref_invoker.Run(ref_argument);

        // apply the epilogue on the host: E = alpha * C + beta * D
        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
            }
        }

        e_device_buf.FromDevice(e_m_n_device_result.mData.data());

        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }

    return 0;
}
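
// Example invocations, based on the argument parsing above (the binary name is
// illustrative; use whatever name your build produces for this example):
//   ./gemm_alpha_beta_xdl_fp16                 // defaults: verify, integer init, no timing
//   ./gemm_alpha_beta_xdl_fp16 1 1 1           // arg1..3: verification, init method, time kernel
//   ./gemm_alpha_beta_xdl_fp16 1 2 1 0.5 2.0   // additionally set alpha = 0.5, beta = 2.0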