Unverified Commit 5aa3c344 authored by rocking5566, committed by GitHub

Merge branch 'develop' into gemm_layernorm_welford

parents 7fefc966 9d8f834a
@@ -58,7 +58,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp = PassThrough;
 using CElementOp = PassThrough;
 
-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNOPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 
 using DeviceGemmInstance =
     ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
@@ -117,7 +117,8 @@ using DeviceGemmInstance =
         1,              // CShuffleMXdlPerWavePerShuffle
         2,              // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        false>;         // MaskOutUpperTriangle
 
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -149,8 +150,8 @@ int main(int argc, char* argv[])
     // GEMM shape for A/B0/B1/C
     // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
-    ck::index_t M = 128;
-    ck::index_t N = 1024;
+    ck::index_t M = 120;
+    ck::index_t N = 1000;
     ck::index_t K = 64;
     ck::index_t O = 128;
     ck::index_t StrideA = -1;
@@ -55,7 +55,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp = PassThrough;
 using CElementOp = PassThrough;
 
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
     ALayout,
@@ -73,7 +73,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     Acc0ElementOp,
     B1ElementOp,
     CElementOp,
-    GemmDefault,
+    GemmSpec,
     1,
     256,
     128, // MPerBlock
@@ -113,7 +113,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftma
     1,              // CShuffleMXdlPerWavePerShuffle
     2,              // CShuffleNXdlPerWavePerShuffle
     S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+    8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+    false>;
 
 // Ref Gemm0: fp16 in, fp32 out
 using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -144,8 +145,8 @@ int main(int argc, char* argv[])
     bool time_kernel = false;
 
     // GEMM shape
-    ck::index_t M = 1024;
-    ck::index_t N = 1024;
+    ck::index_t M = 1020;
+    ck::index_t N = 1020;
     ck::index_t K = 64;
     ck::index_t O = 128;
     ck::index_t BatchCount = 4;
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
/*
Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
                                                                  |-----------------|
                                                                          Gemm0
                                                                  |-------------------------------------|
                                                                                  Gemm1
*/
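// A minimal host-side sketch of the fused computation (illustrative pseudocode only; the
// ReferenceGemm0/ReferenceSoftmax/ReferenceGemm1 instances defined below do the actual verification):
//
//   for each group, batch g and row m:
//       acc[n]     = alpha * sum_k A[g][m][k] * B0[g][k][n]                       // Gemm0 + Acc0ElementOp (Scale)
//       p[n]       = exp(acc[n] - max_n acc) / sum_n exp(acc[n] - max_n acc)      // Softmax over n
//       C[g][m][o] = sum_n p[n] * B1[g][n][o]                                     // Gemm1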
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using B0DataType = F16;
using B1DataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using CDataType = F16;
using ALayout = Row;
using B0Layout = Col;
using B1Layout = Row;
using CPermuteNumDims_G_M_O =
S<1, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_G_M_O
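// Here S<1, 1, 1> presumably means one output dimension each for G, M and O, i.e. the output is a
// rank-3 [G, M, O] tensor whose actual memory layout is supplied at runtime via c_gs_ms_os_strides.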
using AElementOp = PassThrough;
using B0ElementOp = PassThrough;
using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
using B1ElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
using DeviceGemmInstance =
ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
ALayout,
B0Layout,
B1Layout,
CPermuteNumDims_G_M_O,
ADataType,
B0DataType,
B1DataType,
CDataType,
AccDataType,
CShuffleDataType,
AElementOp,
B0ElementOp,
Acc0ElementOp,
B1ElementOp,
CElementOp,
GemmSpec,
1,
256,
128, // MPerBlock
128, // NPerBlock
32, // KPerBlock
64, // Gemm1NPerBlock
32, // Gemm1KPerBlock
8, // AK1
8, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
2, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<4, 64, 1>, // BBlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
true,
S<16, 16, 1>, // B1BlockTransfer
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
false,
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8, // CShuffleBlockTransferScalarPerVector_NPerBlock
false>;
// Ref Gemm0: fp16 in, fp32 out
using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
B0DataType,
AccDataType,
AccDataType,
AElementOp,
B0ElementOp,
Acc0ElementOp>;
// Ref Softmax: fp32 in, fp16 out
using ReferenceSoftmaxInstance =
ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;
// Ref Gemm1: fp16 in, fp16 out
using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
B1DataType,
CDataType,
AccDataType,
AElementOp,
B1ElementOp,
CElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
exit(0);
}
float alpha = 1; // scaling after 1st gemm
std::size_t group_count = 13;
// Problem descs
std::vector<DeviceGemmInstance::ProblemDesc> problem_descs;
std::vector<const void*> p_a;
std::vector<const void*> p_b0;
std::vector<const void*> p_b1;
std::vector<void*> p_c;
for(std::size_t i = 0; i < group_count; i++)
{
int M = 128 * (rand() % 8 + 1);
int N = 128 * (rand() % 8 + 1);
int K = 40;
int O = 40 * (rand() % 2 + 1);
int Batch = rand() % 8 + 1;
const int StrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int StrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
const int StrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
const int BatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
const int BatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
const int BatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
std::vector<ck::index_t> c_gs_ms_os_lengths{Batch, M, O};
std::vector<ck::index_t> c_gs_ms_os_strides{O, Batch * O, 1};
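// With lengths [Batch, M, O] and strides [O, Batch * O, 1], element (g, m, o) sits at offset
// g * O + m * Batch * O + o, so the output is stored as [M, G, O] in memory; this G/M swap is the
// output permutation the example demonstrates.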
problem_descs.push_back({M,
N,
K,
O,
Batch,
StrideA,
StrideB0,
StrideB1,
BatchStrideA,
BatchStrideB0,
BatchStrideB1,
c_gs_ms_os_lengths,
c_gs_ms_os_strides});
}
auto f_host_tensor_descriptor = [](std::size_t batch_count,
std::size_t row,
std::size_t col,
std::size_t stride,
std::size_t batch_stride,
auto layout) {
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
std::vector<Tensor<ADataType>> a_tensors;
std::vector<Tensor<B0DataType>> b0_tensors;
std::vector<Tensor<B1DataType>> b1_tensors;
std::vector<Tensor<CDataType>> c_tensors;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_tensors_device;
std::vector<DeviceMemPtr> b0_tensors_device;
std::vector<DeviceMemPtr> b1_tensors_device;
std::vector<DeviceMemPtr> c_tensors_device;
std::size_t flop = 0, num_byte = 0;
std::cout << "group count " << group_count << ". printing first 4 groups\n";
for(std::size_t i = 0; i < group_count; i++)
{
const auto& M = problem_descs[i].M;
const auto& N = problem_descs[i].N;
const auto& K = problem_descs[i].K;
const auto& O = problem_descs[i].O;
const auto& Batch = problem_descs[i].Batch;
const auto& StrideA = problem_descs[i].StrideA;
const auto& StrideB0 = problem_descs[i].StrideB0;
const auto& StrideB1 = problem_descs[i].StrideB1;
const auto& BatchStrideA = problem_descs[i].BatchStrideA;
const auto& BatchStrideB0 = problem_descs[i].BatchStrideB0;
const auto& BatchStrideB1 = problem_descs[i].BatchStrideB1;
const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
// C_m_o = A_m_k * B0_k_n * B1_n_o
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(Batch, M, K, StrideA, BatchStrideA, ALayout{}));
Tensor<B0DataType> b0_g_k_n(
f_host_tensor_descriptor(Batch, K, N, StrideB0, BatchStrideB0, B0Layout{}));
Tensor<B1DataType> b1_g_n_o(
f_host_tensor_descriptor(Batch, N, O, StrideB1, BatchStrideB1, B1Layout{}));
Tensor<CDataType> c_gs_ms_os_device_result(
std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
num_byte += (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
Batch;
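// flop counts 2*M*N*K (Gemm0) plus 2*M*N*O (Gemm1) per batch; softmax work is not included.
// num_byte counts each of A, B0, B1 and C being touched once per batch.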
if(i < 4)
{
std::cout << "a_g_m_k[" << i << "]: " << a_g_m_k.mDesc << ", "
<< "b0_g_k_n[" << i << "]: " << b0_g_k_n.mDesc << ", "
<< "b1_g_n_o[" << i << "]: " << b1_g_n_o.mDesc << ", "
<< "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc << std::endl;
}
switch(init_method)
{
case 0: break;
case 1:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
break;
case 2:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
break;
case 3:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
break;
default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
}
a_tensors.push_back(a_g_m_k);
b0_tensors.push_back(b0_g_k_n);
b1_tensors.push_back(b1_g_n_o);
c_tensors.push_back(c_gs_ms_os_device_result);
a_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()));
b0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()));
b1_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()));
c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()));
a_tensors_device[i]->ToDevice(a_g_m_k.mData.data());
b0_tensors_device[i]->ToDevice(b0_g_k_n.mData.data());
b1_tensors_device[i]->ToDevice(b1_g_n_o.mData.data());
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
p_b0.push_back(b0_tensors_device[i]->GetDeviceBuffer());
p_b1.push_back(b1_tensors_device[i]->GetDeviceBuffer());
p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
}
auto a_element_op = AElementOp{};
auto b0_element_op = B0ElementOp{};
auto acc0_element_op = Acc0ElementOp{alpha};
auto b1_element_op = B1ElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(p_a,
p_b0,
p_b1,
p_c,
problem_descs,
a_element_op,
b0_element_op,
acc0_element_op,
b1_element_op,
c_element_op);
// specify workspace for problem_desc
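// (presumably the grouped kernel reads the per-group problem descriptors from device memory,
// hence this extra workspace allocation and the SetWorkSpacePointer call)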
DeviceMem problem_desc_workspace(gemm.GetWorkSpaceSize(&argument));
gemm.SetWorkSpacePointer(&argument, problem_desc_workspace.GetDeviceBuffer());
if(!gemm.IsSupportedArgument(argument))
{
std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
for(std::size_t i = 0; i < group_count; i++)
{
const auto& M = problem_descs[i].M;
const auto& N = problem_descs[i].N;
const auto& O = problem_descs[i].O;
const auto& Batch = problem_descs[i].Batch;
const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;
const auto& a_g_m_k = a_tensors[i];
const auto& b0_g_k_n = b0_tensors[i];
const auto& b1_g_n_o = b1_tensors[i];
auto& c_gs_ms_os_device_result = c_tensors[i];
auto& c_gs_ms_os_device_buf = *c_tensors_device[i];
Tensor<CDataType> c_gs_ms_os_host_result(
std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
// Output of Gemm0 is input A of Gemm1
Tensor<AccDataType> acc0_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(Batch, M, N, N, M * N, Row{}));
Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{Batch, M, O},
std::vector<int>{M * O, O, 1});
auto ref_gemm0 = ReferenceGemm0Instance{};
auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
auto ref_gemm0_argument = ref_gemm0.MakeArgument(
a_g_m_k, b0_g_k_n, acc0_m_n, a_element_op, b0_element_op, acc0_element_op);
ref_gemm0_invoker.Run(ref_gemm0_argument);
auto ref_softmax = ReferenceSoftmaxInstance{};
auto ref_softmax_invoker = ref_softmax.MakeInvoker();
auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_m_n, a1_g_m_n, 1, 0, {2});
ref_softmax_invoker.Run(ref_softmax_argument);
auto ref_gemm1 = ReferenceGemm1Instance{};
auto ref_gemm1_invoker = ref_gemm1.MakeInvoker();
auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
b1_g_n_o,
c_g_m_o_host_result,
PassThrough{},
b1_element_op,
c_element_op);
ref_gemm1_invoker.Run(ref_gemm1_argument);
// Note: in this example, we merely permute the dimensions by changing underlying
// strides so we simply access data as-is
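// c_gs_ms_os_host_result was created with the same lengths/strides as the device-side tensor, so
// once the ForEach below copies the reference values in by logical index, the two underlying
// buffers can be compared element for element.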
c_gs_ms_os_host_result.ForEach(
[&](auto& self, auto idx) { self(idx) = c_g_m_o_host_result(idx); });
bool pass_ =
ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData);
pass &= pass_;
}
}
return pass ? 0 : 1;
}
add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp)
add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
template <ck::index_t NDimSpatial,
typename OutDataType,
typename WeiDataType,
typename BiasDataType,
typename InDataType,
typename OutElementOp,
typename WeiElementOp,
typename InElementOp,
typename DeviceInstance>
int run_conv_bwd_data_bias_relu(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& bias_g_n_c_wis_desc,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const OutElementOp& out_element_op,
const WeiElementOp& wei_element_op,
const InElementOp& in_element_op)
{
Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<BiasDataType> bias(bias_g_n_c_wis_desc);
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
std::cout << "out: " << out.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
std::cout << "in: " << in_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5});
break;
default:
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{0.0, 1.0});
}
DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
out_device_buf.ToDevice(out.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
// reset input to zero
in_device_buf.SetZero();
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(bias_g_n_c_wis_desc.GetLengths(), d0_g_n_c_wis_lengths);
copy(bias_g_n_c_wis_desc.GetStrides(), d0_g_n_c_wis_strides);
copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do conv
auto conv = DeviceInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(
out_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
in_device_buf.GetDeviceBuffer(),
a_g_n_k_wos_lengths,
a_g_n_k_wos_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_lengths},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{d0_g_n_c_wis_strides},
e_g_n_c_wis_lengths,
e_g_n_c_wis_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
out_element_op,
wei_element_op,
in_element_op);
if(!conv.IsSupportedArgument(argument))
{
printf("wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem\n");
return 1;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// c doesn't physically exist, any layout is fine
Tensor<float> c_host(in_g_n_c_wis_desc);
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
float,
WeiDataType,
OutDataType,
PassThrough,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(c_host,
wei,
out,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
PassThrough{},
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
// TODO: implement elementwise operation for host
in_host.ForEach(
[&](auto&, auto idx) { in_element_op(in_host(idx), c_host(idx), bias(idx)); });
in_device_buf.FromDevice(in_device.mData.data());
return ck::utils::check_err(in_device.mData, in_host.mData) ? 0 : 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_conv_bwd_data_bias_relu_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using OutDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using BiasDataType = ck::half_t; // bias
using InDataType = ck::half_t;
using OutLayout = ck::tensor_layout::convolution::GNHWK;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_C;
using InLayout = ck::tensor_layout::convolution::GNHWC;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using CBiasInElementOp = ck::tensor_operation::element_wise::AddRelu;
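// AddRelu fuses the bias addition and the ReLU into the output elementwise op, presumably
// computing in = relu(conv_bwd_data_out + bias).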
static constexpr auto ConvBwdDataDefault =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
template <ck::index_t NDimSpatial>
using DeviceConvNdBwdDataInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
InLayout,
OutDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType>,
InDataType,
OutElementOp,
WeiElementOp,
CBiasInElementOp,
ConvBwdDataDefault,
true, // DoPadGemmM
true, // DoPadGemmN
1,
256,
128,
256,
32,
8,
2,
32,
32,
2,
4,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
S<4, 64, 1>,
S<0, 2, 1>,
S<0, 2, 1>,
1,
4,
2,
0,
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 256, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = CBiasInElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 2)
{
// output image: GNHWK
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
// weight: GKYXC
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
// input image bias: G_C
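// (lengths cover G, N, C, Hi, Wi, but the zero strides below broadcast the per-channel bias
// across the N, Hi and Wi dimensions)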
const auto bias_g_n_c_wis_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
0, // n
1, // c
0, // hi
0 // wi
});
// input image: GNHWC
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
using DeviceInstance = DeviceConvNdBwdDataInstance<2>;
run_conv_bwd_data_bias_relu<2,
OutDataType,
WeiDataType,
BiasDataType,
InDataType,
OutElementOp,
WeiElementOp,
CBiasInElementOp,
DeviceInstance>(do_verification,
init_method,
time_kernel,
conv_param,
out_g_n_k_wos_desc,
wei_g_k_c_xs_desc,
bias_g_n_c_wis_desc,
in_g_n_c_wis_desc,
out_element_op,
wei_element_op,
in_element_op);
}
return 0;
}
add_custom_target(example_permute)
add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)
add_dependencies(example_permute example_permute_1xHxW_fp16)
add_dependencies(example_permute example_permute_NxHxW_fp16)
add_dependencies(example_permute example_permute_HxWx4_fp16)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iterator>
#include <numeric>
#include <type_traits>
#include <utility>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/utility/type.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using F64 = double;
struct Problem final
{
static constexpr std::size_t NumDim = 3;
using Shape = std::array<std::size_t, NumDim>;
using Axes = Shape;
Problem() = delete;
explicit Problem(const Shape& default_shape, const Axes& default_axes)
: shape(default_shape), axes(default_axes)
{
}
Shape shape;
Axes axes;
};
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
namespace detail {
template <typename Array, std::size_t Difference>
struct enlarge_array_size;
template <typename T, std::size_t Size, std::size_t Difference>
struct enlarge_array_size<std::array<T, Size>, Difference>
{
using type = std::array<T, Size + Difference>;
};
template <typename Array, std::size_t Difference>
using enlarge_array_size_t = typename enlarge_array_size<Array, Difference>::type;
template <typename Array>
struct get_array_size;
template <typename T, std::size_t Size>
struct get_array_size<std::array<T, Size>> : std::integral_constant<std::size_t, Size>
{
};
template <typename Array>
inline constexpr std::size_t get_array_size_v = get_array_size<Array>::value;
template <typename T, typename = void>
struct is_iterator : std::false_type
{
};
template <typename T>
struct is_iterator<T,
std::void_t<decltype(*std::declval<T>()),
decltype(++std::declval<std::add_lvalue_reference_t<T>>()),
decltype(std::declval<std::add_lvalue_reference_t<T>>()++)>>
: std::true_type
{
};
template <typename T>
inline constexpr bool is_iterator_v = is_iterator<T>::value;
struct Placeholder final
{
template <typename T>
constexpr inline operator T() const noexcept;
};
template <typename Iterator, typename = void>
struct is_output_iterator : std::false_type
{
};
template <typename Iterator>
struct is_output_iterator<
Iterator,
std::void_t<decltype(*std::declval<Iterator>() = std::declval<Placeholder>())>>
: std::bool_constant<is_iterator_v<Iterator>>
{
};
template <typename T>
inline constexpr bool is_output_iterator_v = is_output_iterator<T>::value;
template <typename Iterator, typename = void>
struct is_bidirectional_iterator : std::false_type
{
};
template <typename Iterator>
struct is_bidirectional_iterator<
Iterator,
std::void_t<decltype(--std::declval<std::add_lvalue_reference_t<Iterator>>()),
decltype(std::declval<std::add_lvalue_reference_t<Iterator>>()--)>>
: std::bool_constant<is_iterator_v<Iterator>>
{
};
template <typename Iterator>
inline constexpr bool is_bidirectional_iterator_v = is_bidirectional_iterator<Iterator>::value;
template <typename Iterator, typename = void>
struct is_random_access_iterator : std::false_type
{
};
template <typename Iterator>
struct is_random_access_iterator<Iterator,
std::void_t<decltype(std::declval<Iterator>() + 1),
decltype(std::declval<Iterator>() - 1),
decltype(std::declval<Iterator>()[1])>>
: std::bool_constant<is_iterator_v<Iterator>>
{
};
template <typename Iterator>
inline constexpr bool is_random_access_iterator_v = is_random_access_iterator<Iterator>::value;
template <typename T, typename = void>
struct is_range : std::false_type
{
};
template <typename T>
struct is_range<T,
std::void_t<decltype(begin(std::declval<T>())),
decltype(end(std::declval<T>())),
decltype(begin(std::declval<T>()) != end(std::declval<T>()))>>
: std::bool_constant<is_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<T>()))>>>
{
};
template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;
template <typename Range, typename = void>
struct is_sized_range : std::false_type
{
};
template <typename Range>
struct is_sized_range<Range, std::void_t<decltype(size(std::declval<Range>()))>>
: std::bool_constant<is_range_v<Range>>
{
};
template <typename Range>
inline constexpr bool is_sized_range_v = is_sized_range<Range>::value;
template <typename Range, typename = void>
struct is_bidirectional_range : std::false_type
{
};
template <typename Range>
struct is_bidirectional_range<Range, std::void_t<>>
: std::bool_constant<
is_range_v<Range> &&
is_bidirectional_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
{
};
template <typename Range>
inline constexpr bool is_bidirectional_range_v = is_bidirectional_range<Range>::value;
template <typename Range, typename = void>
struct is_random_access_range : std::false_type
{
};
template <typename Range>
struct is_random_access_range<Range, std::void_t<>>
: std::bool_constant<
is_range_v<Range> &&
is_random_access_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
{
};
template <typename Range>
inline constexpr bool is_random_access_range_v = is_random_access_range<Range>::value;
template <typename Range>
class to_array_proxy
{
static_assert(is_range_v<Range>);
public:
explicit to_array_proxy(const Range& source) noexcept : source_(source) {}
template <typename T, std::size_t Size>
operator std::array<T, Size>() const
{
std::array<T, Size> destination;
std::copy_n(std::begin(source_),
std::min<std::size_t>(Size, std::size(source_)),
std::begin(destination));
return destination;
}
private:
const Range& source_;
};
} // namespace detail
template <typename Range>
inline auto to_array(Range& range) noexcept
-> std::enable_if_t<detail::is_range_v<Range>,
detail::to_array_proxy<ck::remove_cvref_t<Range>>>
{
return detail::to_array_proxy<ck::remove_cvref_t<Range>>{range};
}
namespace ranges {
template <typename InputRange, typename OutputIterator>
inline auto copy(InputRange&& range, OutputIterator iter)
-> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter))
{
return std::copy(std::begin(std::forward<InputRange>(range)),
std::end(std::forward<InputRange>(range)),
iter);
}
} // namespace ranges
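// Valid axes form a permutation of {0, ..., size(axes) - 1}: no duplicates, smallest element 0,
// largest element size(axes) - 1.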
template <typename Axes>
inline auto is_valid_axes(const Axes& axes)
-> std::enable_if_t<detail::is_random_access_range_v<Axes>, bool>
{
using std::empty;
if(empty(axes))
{
return false;
}
using std::begin, std::end;
std::vector<std::size_t> sorted_axes(begin(axes), end(axes));
std::sort(begin(sorted_axes), end(sorted_axes));
const auto last = std::unique(begin(sorted_axes), end(sorted_axes));
return (last == end(sorted_axes)) && (*begin(sorted_axes) == 0) &&
(*std::prev(last) == size(axes) - 1);
}
template <typename Shape>
inline auto is_valid_shape(const Shape& shape) -> std::enable_if_t<detail::is_range_v<Shape>, bool>
{
static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(shape))>>);
using std::begin, std::end;
using std::empty;
return !empty(shape) && std::all_of(begin(shape), end(shape), [](auto dim) { return 0 < dim; });
}
template <typename Shape, typename Indices>
inline auto is_valid_indices(const Shape& shape, const Indices& indices)
-> std::enable_if_t<detail::is_sized_range_v<Shape> && detail::is_sized_range_v<Indices>, bool>
{
static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(indices))>>);
if(!is_valid_shape(shape))
{
return false;
}
using std::empty;
if(empty(indices))
{
return false;
}
using std::size;
if(size(shape) != size(indices))
{
return false;
}
using std::begin, std::end;
auto dim = begin(shape);
auto idx = begin(indices);
for(; dim != end(shape) && idx != end(indices); ++dim, ++idx)
{
if(*dim <= *idx)
{
return false;
}
}
return true;
}
template <std::size_t Size>
std::array<std::size_t, Size> transpose(const std::array<std::size_t, Size>& shape,
const std::array<std::size_t, Size>& axes)
{
assert(is_valid_shape(shape) && is_valid_axes(axes));
std::array<std::size_t, Size> transposed;
auto iter = std::begin(transposed);
for(const auto axis : axes)
{
*iter++ = shape[axis];
}
return transposed;
}
auto extend_shape(const Problem::Shape& shape, std::size_t new_dim)
{
detail::enlarge_array_size_t<Problem::Shape, 1> extended_shape;
using std::begin, std::end;
std::copy(begin(shape), end(shape), begin(extended_shape));
extended_shape.back() = new_dim;
return extended_shape;
}
auto extend_axes(const Problem::Axes& axes)
{
detail::enlarge_array_size_t<Problem::Axes, 1> extended_axes;
using std::begin, std::end;
std::copy(begin(axes), end(axes), begin(extended_axes));
extended_axes.back() = detail::get_array_size_v<Problem::Axes>;
return extended_axes;
}
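// Advance a multi-dimensional index tuple in row-major (odometer) order; returns false once every
// index has wrapped back to zero, i.e. after the last element has been visited.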
template <typename Shape, typename Indices>
auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t<
detail::is_bidirectional_range_v<Shape> && detail::is_sized_range_v<Shape> &&
detail::is_bidirectional_range_v<Indices> && detail::is_sized_range_v<Indices>,
bool>
{
using std::size;
if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices)))
{
return false;
}
bool carry = true;
using std::rbegin, std::rend;
auto dim = rbegin(shape);
auto idx = rbegin(indices);
for(; carry && dim != rend(shape) && idx != rend(indices); ++dim, ++idx)
{
*idx = (*idx + carry);
carry = ((*idx == *dim) ? (*idx = 0, true) : false);
}
return !carry;
}
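// Host reference permutation: for every index tuple of src, write functor(src(idx...)) to dest at
// the transposed position given by axes. Only rank-3 and rank-4 tensors are handled.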
template <typename Src, typename Axes, typename Functor, typename Dest>
auto host_permute(const Tensor<Src>& src, const Axes& axes, Functor functor, Tensor<Dest>& dest)
-> std::enable_if_t<detail::is_random_access_range_v<Axes> && detail::is_sized_range_v<Axes> &&
std::is_invocable_v<Functor,
std::add_lvalue_reference_t<Dest>,
std::add_lvalue_reference_t<Src>>,
bool>
{
const auto& shape = src.mDesc.GetLengths();
const auto& transposed_shape = dest.mDesc.GetLengths();
if(!(is_valid_shape(shape) && is_valid_shape(transposed_shape)))
{
return false;
}
using std::size;
if(!is_valid_axes(axes))
{
return false;
}
static_assert(detail::is_sized_range_v<ck::remove_cvref_t<decltype(shape)>> &&
detail::is_sized_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);
if(size(shape) != size(transposed_shape))
{
return false;
}
static_assert(detail::is_random_access_range_v<ck::remove_cvref_t<decltype(shape)>> &&
detail::is_random_access_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);
{
for(std::size_t idx = 0; idx < size(shape); ++idx)
{
if(transposed_shape[idx] != shape[axes[idx]])
{
return false;
}
}
}
std::vector<std::size_t> indices(size(shape), 0);
if(!is_valid_indices(shape, indices))
{
return false;
}
switch(size(shape))
{
case 3: {
do
{
Dest output = 0;
functor(output, src(indices[0], indices[1], indices[2]));
dest(indices[axes[0]], indices[axes[1]], indices[axes[2]]) = output;
} while(advance_indices(shape, indices));
}
break;
case 4: {
do
{
Dest output = 0;
functor(output, src(indices[0], indices[1], indices[2], indices[3]));
dest(indices[axes[0]], indices[axes[1]], indices[axes[2]], indices[axes[3]]) = output;
} while(advance_indices(shape, indices));
}
break;
default: return false;
}
return true;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using InDataType = F16;
using OutDataType = F16;
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, InDataType, OutDataType, PassThrough, 256, 1, 32, 32, 3, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 2, 1>;
// clang-format on
#include "run_permute_element_example.inc"
int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using DataType = F16;
using BundleType = F64;
static_assert(sizeof(BundleType) % sizeof(DataType) == 0);
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, BundleType, BundleType, PassThrough, 256, 1, 32, 32, 5, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 4, 1>;
// clang-format on
#include "run_permute_bundle_example.inc"
int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using InDataType = F16;
using OutDataType = F16;
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, InDataType, OutDataType, PassThrough, 128, 4, 16, 8, 6, S<2, 16, 4>, S<0, 1, 2>, 2, 1, 2, 1>;
// clang-format on
#include "run_permute_element_example.inc"
int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool run_permute_bundle(const Problem& problem)
{
const auto& input_bundle_shape = problem.shape;
const auto& input_bundle_axes = problem.axes;
const auto output_bundle_shape = transpose(input_bundle_shape, input_bundle_axes);
Tensor<BundleType> input_bundle_tensor(input_bundle_shape);
Tensor<BundleType> output_bundle_tensor(output_bundle_shape);
// initialize tensor by assigning DataType values
ck::utils::FillUniformDistribution<DataType>{-1.f, 1.f}(input_bundle_tensor.AsSpan<DataType>());
DeviceMem input_device_buf(input_bundle_tensor.GetElementSpaceSizeInBytes());
DeviceMem output_device_buf(output_bundle_tensor.GetElementSpaceSizeInBytes());
using std::data;
input_device_buf.ToDevice(data(input_bundle_tensor));
static_assert(std::is_default_constructible_v<DevicePermuteInstance>);
auto permute = DevicePermuteInstance{};
auto argument = permute.MakeArgument(to_array(input_bundle_shape),
to_array(input_bundle_tensor.GetStrides()),
to_array(output_bundle_shape),
to_array(output_bundle_tensor.GetStrides()),
input_device_buf.GetDeviceBuffer(),
output_device_buf.GetDeviceBuffer(),
PassThrough{});
if(!permute.IsSupportedArgument(argument))
{
std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
<< std::endl;
return false;
};
auto invoker = permute.MakeInvoker();
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
std::cout << "Perf: " << ave_time << " ms" << std::endl;
output_device_buf.FromDevice(data(output_bundle_tensor));
constexpr std::size_t NumElemsInBundle = sizeof(BundleType) / sizeof(DataType);
// extend tensor shape from [N, H, W] to [N, H, W, NumElemsInBundle]
// axes from [0, 2, 1] to [0, 2, 1, 3]
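// e.g. with DataType = F16 and BundleType = F64 (as in permute_NxHxW_fp16.cpp above):
// NumElemsInBundle = sizeof(double) / sizeof(half) = 4, so bundle shape {1, 80, 32000} becomes
// element shape {1, 80, 32000, 4} and axes {0, 2, 1} become {0, 2, 1, 3}.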
const auto input_shape = extend_shape(input_bundle_shape, NumElemsInBundle);
const auto input_axes = extend_axes(input_bundle_axes);
using std::begin;
Tensor<DataType> input_tensor(input_shape);
ranges::copy(input_bundle_tensor.AsSpan<const DataType>(), begin(input_tensor));
Tensor<DataType> output_tensor(transpose(input_shape, input_axes));
if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor))
{
return false;
}
return ck::utils::check_err(output_bundle_tensor.AsSpan<const DataType>(),
output_tensor.AsSpan<const DataType>(),
"Error: incorrect results in output tensor",
1e-6,
1e-6);
}
bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
{
return run_permute_bundle(Problem{shape, axes});
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool run_permute_element(const Problem& problem)
{
const auto& input_shape = problem.shape;
const auto& input_axes = problem.axes;
const auto output_shape = transpose(input_shape, input_axes);
Tensor<InDataType> input_tensor(input_shape);
Tensor<OutDataType> output_tensor(output_shape);
ck::utils::FillUniformDistribution<InDataType>{-1.f, 1.f}(input_tensor);
DeviceMem input_device_buf(input_tensor.GetElementSpaceSizeInBytes());
DeviceMem output_device_buf(output_tensor.GetElementSpaceSizeInBytes());
using std::data;
input_device_buf.ToDevice(data(input_tensor));
static_assert(std::is_default_constructible_v<DevicePermuteInstance>);
auto permute = DevicePermuteInstance{};
auto argument = permute.MakeArgument(to_array(input_shape),
to_array(input_tensor.GetStrides()),
to_array(output_shape),
to_array(output_tensor.GetStrides()),
input_device_buf.GetDeviceBuffer(),
output_device_buf.GetDeviceBuffer(),
PassThrough{});
if(!permute.IsSupportedArgument(argument))
{
std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
<< std::endl;
return false;
};
auto invoker = permute.MakeInvoker();
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
std::cout << "Perf: " << ave_time << " ms" << std::endl;
output_device_buf.FromDevice(data(output_tensor));
Tensor<OutDataType> output_tensor_host(output_shape);
if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor_host))
{
return false;
}
return ck::utils::check_err(output_tensor.AsSpan<const OutDataType>(),
output_tensor_host.AsSpan<const OutDataType>(),
"Error: incorrect results in output tensor",
1e-6,
1e-6);
}
bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
{
return run_permute_element(Problem{shape, axes});
}
add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using AccDataType = float;
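// Fused output elementwise op: y = x * sigmoid(x) (the Swish/SiLU activation) applied to the
// normalized value.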
struct YElementOp
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
ck::is_same<T, ck::half_t>::value,
"Data type is not supported by this operation!");
T a;
ck::tensor_operation::element_wise::Sigmoid{}(a, x);
y = x * a;
};
};
using DeviceInstance =
ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
YElementOp,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // SrcVecDim (0=M, 1=K)
8, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // OutScalarPerVector
int main(int argc, char* argv[])
{
ck::index_t N = 128;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 32;
ck::index_t C = 40;
if(argc == 1)
{
// use default case
}
else if(argc == 6)
{
N = std::stoi(argv[1]);
H = std::stoi(argv[2]);
W = std::stoi(argv[3]);
G = std::stoi(argv[4]);
C = std::stoi(argv[5]);
}
else
{
std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
return 1;
}
Tensor<XDataType> x({N, H, W, G, C});
Tensor<YDataType> y({N, H, W, G, C});
Tensor<GammaDataType> gamma({G, C});
Tensor<BetaDataType> beta({G, C});
ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x.begin(), x.end());
ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma.begin(), gamma.end());
ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta.begin(), beta.end());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
const auto y_element_op = YElementOp{};
auto device_instance = DeviceInstance{};
auto argument_ptr = device_instance.MakeArgumentPointer(
{N, H, W, G, C},
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
{0, 0, 0, C, 1},
{0, 0, 0, C, 1},
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
{1, 2, 4}, // reduction dimension: [H, W, C]
1e-6,
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
y_element_op);
if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported" << std::endl;
return 1;
};
auto invoker_ptr = device_instance.MakeInvokerPointer();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
sizeof(BetaDataType) * G * C;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< device_instance.GetTypeString() << std::endl;
bool pass = true;
{
Tensor<YDataType> host_y({N, H, W, G, C});
using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
YElementOp>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
y_dev.FromDevice(y.mData.data());
pass &= ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
}
return (pass ? 0 : 1);
}
@@ -21,35 +21,10 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     add_dependencies(examples ${EXAMPLE_NAME})
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 
-add_subdirectory(01_gemm)
-add_subdirectory(02_gemm_bilinear)
-add_subdirectory(03_gemm_bias_relu)
-add_subdirectory(04_gemm_add_add_fastgelu)
-add_subdirectory(09_convnd_fwd)
-add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
-add_subdirectory(12_reduce)
-add_subdirectory(13_pool2d_fwd)
-add_subdirectory(14_gemm_xdl_requant_relu_requant)
-add_subdirectory(15_grouped_gemm)
-add_subdirectory(16_gemm_multi_d_multi_reduces)
-add_subdirectory(17_convnd_bwd_data)
-add_subdirectory(18_batched_gemm_reduce)
-add_subdirectory(19_binary_elementwise)
-add_subdirectory(20_convnd_bwd_weight)
-add_subdirectory(21_gemm_layernorm)
-add_subdirectory(22_cgemm)
-add_subdirectory(23_softmax)
-add_subdirectory(24_batched_gemm)
-add_subdirectory(25_gemm_bias_e_permute)
-add_subdirectory(26_contraction)
-add_subdirectory(27_layernorm)
-add_subdirectory(28_grouped_gemm_bias_e_permute)
-add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
-add_subdirectory(31_batched_gemm_gemm)
-add_subdirectory(32_batched_gemm_scale_softmax_gemm)
-add_subdirectory(33_multiple_reduce)
-add_subdirectory(34_batchnorm)
-add_subdirectory(35_splitK_gemm)
-add_subdirectory(36_sparse_embedding)
-add_subdirectory(41_grouped_conv_conv_fwd)
+# add all example subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+    IF(IS_DIRECTORY "${subdir}")
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
@@ -10,4 +10,5 @@ struct StreamConfig
 {
     hipStream_t stream_id_ = nullptr;
     bool time_kernel_ = false;
+    int log_level_ = 0;
 };
@@ -649,6 +649,9 @@ struct BlockwiseGemmXdlops_v2
     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
 
+    static_assert(KPerThread % KPack == 0,
+                  "Wrong KPack setting; try increasing KPerThread or decreasing KPack");
+
     StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                               FloatAcc,
                               MRepeat * NRepeat,