Commit c5b3f69f authored by Chao Liu's avatar Chao Liu
Browse files

pseudo code

parent 800f0ab3
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"
// program
// program
struct GemmMultiplD
{
    // Builds a 1-D packed tensor descriptor of extent `x` through the tile
    // program and prints its length. `y` is currently unused (kept so the
    // launch signature stays unchanged).
    __host__ __device__ void operator()(TileProgram& tp, int x, int y)
    {
        const auto desc = tp.make_naive_tensor_descriptor_packed(ck::make_tuple(x));
        const auto len  = desc.GetLength(ck::Number<0>{});
        printf("length %d\n", len);
    }
};
int main()
{
    // Demo driver: launch the functor defined above on a 1x1 grid.
    // was: launched `HelloWorld{}`, which is not defined in this file —
    // the functor defined above is `GemmMultiplD`.
    int x = 100;
    int y = 101;

    launch(GemmMultiplD{}, 1, 1, x, y);

    return 0;
}
template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation>
struct GemmMultiD
{
    // Number of auxiliary D tensors fused into the epilogue: E = cde_op(A*B, Ds...).
    static constexpr index_t NumDTensor = DsDataType::Size();

    // Tile-program pseudo code for a multi-D GEMM.
    // NOTE(review): this is illustrative pseudo code — the strategy objects
    // (a_lds_block_strategy, block_gemm_strategy, ...), the element-wise
    // operator instances (a_element_op, ...), and the tile sizes
    // (kMPerBlock, kNPerBlock, kKPerBlock, kMPerShuffle, kNPerShuffle)
    // are assumed to be in scope; `xxx` placeholders are kept from the original.
    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, 2> a_m_k_lengths,
               const std::array<index_t, 2> a_m_k_strides,
               const std::array<index_t, 2> b_n_k_lengths,
               const std::array<index_t, 2> b_n_k_strides,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_lengths,
               const std::array<const std::array<index_t, 2>, NumDTensor> ds_m_n_strides,
               const std::array<index_t, 2> e_m_n_lengths,
               const std::array<index_t, 2> e_m_n_strides,
               //
               // was: `const T*` with undefined T, and std::array<const T*> missing its size
               const ADataType* p_a,
               const BDataType* p_b,
               const std::array<const void*, NumDTensor> p_ds,
               EDataType* p_e)
    {
        using namespace ck;

        const auto b = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);

        const auto ds = tp(generate_tuple(
            [&](auto i) {
                // was: statement ended with `,` instead of `;`
                return make_naive_tensor(ds_m_n_lengths[i], ds_m_n_strides[i], p_ds[i]);
            },
            Number<NumDTensor>{}));

        auto e = tp(make_naive_tensor(e_m_n_lengths, e_m_n_strides), p_e);

        // divide problem into M x N tiles, one tile per workgroup
        const auto num_m = e_m_n_lengths[0];
        const auto num_n = e_m_n_lengths[1];

        const auto id_block = get_block_1d_id();

        // was: num_gemmm / num_gemmn (undefined) divided by MPerTile / NPerTile
        const auto num_tile_m = num_m / kMPerBlock;
        const auto num_tile_n = num_n / kNPerBlock;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_n)));

        // was: CalculateBottonIndex (typo)
        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_n = id_tile.At<1>();

        // A/B in DRAM
        // A/B DRAM layout is part of problem, not solution
#if 1
        // DO NOT let user know there is optimization on tensor transform on A/B DRAM tensor
        // was: p_a_dram / p_b_dram (undefined) — the parameters are p_a / p_b
        const auto a_dram_global = tp(make_naive_tensor(a_m_k_lengths, a_m_k_strides), p_a);
        const auto b_dram_global = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);
#endif

        // A/B tile in LDS
        // A/B LDS layout is part of solution
        // was: shared_memmory (typo)
        ADataType* p_a_lds = shared_memory.get_pointer(0);

        // [allow optimization] allow different LDS layouts
        constexpr auto a_lds_block =
            make_tensor(p_a_lds, {kMPerBlock, kKPerBlock}, a_lds_block_strategy);

        constexpr auto a_lds_byte = a_lds_block.get_num_of_byte();

        BDataType* p_b_lds = shared_memory.get_aligned_pointer(a_lds_byte);

        // [allow optimization] allow different LDS layouts
        // was: make_tensor({p_b_lds, kNPerBlock, kKPerBlock}, ...) — pointer inside the
        // brace list instead of being the first argument (compare the A tensor above)
        constexpr auto b_lds_block =
            make_tensor(p_b_lds, {kNPerBlock, kKPerBlock}, b_lds_block_strategy);

        // A/B copy
#if 0
        auto a_block_copy = make_copier(a_dram_global,
                                        a_lds_block,
                                        make_tuple(kMPerBlock, kKPerBlock),
                                        make_tuple(id_tile_m * kMPerBlock, 0),
                                        a_block_copy_strategy);

        auto b_block_copy = make_copier(b_dram_global,
                                        b_lds_block,
                                        make_tuple(kNPerBlock, kKPerBlock),
                                        make_tuple(id_tile_n * kNPerBlock, 0),
                                        b_block_copy_strategy);
#else
        // was: K-offset used the undefined id_tile_k — the A tile starts at K == 0
        // (the main loop below advances the window along K), matching the #if 0 branch
        auto window_a_dram = make_window(a_dram_global,
                                         {kMPerBlock, kKPerBlock},
                                         {id_tile_m * kMPerBlock, 0},
                                         a_dram_window_map_strategy);

        // was: {NPerTile, KPerTile} — the A window is M x K
        auto window_a_block =
            make_window(a_lds_block, {kMPerBlock, kKPerBlock}, {0, 0}, a_lds_window_map_strategy);

        // was missing: window_b_dram is consumed by the main loop but never created
        auto window_b_dram = make_window(b_dram_global,
                                         {kNPerBlock, kKPerBlock},
                                         {id_tile_n * kNPerBlock, 0},
                                         b_dram_window_map_strategy);
#endif

#if 1
        // block GEMM
        // operation-based syntax: per-operation solution strategy
        auto block_gemm = make_block_gemm(a_lds_block, b_lds_block, block_gemm_strategy);
#endif

        // Distributed C accumulator in VGPR
#if 1
        // C layout is decided alone
        // C should be distributed
        auto c_vgpr_block =
            make_distributed_tensor({kMPerBlock, kNPerBlock}, c_vgpr_block_strategy);
#elif 0
        // C layout is decided by block GEMM
        auto c_vgpr_block = block_gemm.get_c_vgpr_block();
#endif

        // main K loop: stream A/B tiles through LDS and accumulate into C
        // was: `K` undefined — the GEMM K extent is a_m_k_lengths[1]
        const auto num_k = a_m_k_lengths[1];

        for(index_t k = 0; k < num_k; k += kKPerBlock)
        {
            auto a_vgpr_block_tmp = load(window_a_dram, a_dram_load_strategy);
            auto b_vgpr_block_tmp = load(window_b_dram, b_dram_load_strategy);

            // apply per-tensor element-wise ops while data is in registers
            // was: a_vpgr_block / b_vpgr_block (typos — the copies below read a_vgpr_block)
            auto a_vgpr_block = elementwise_op(a_vgpr_block_tmp, a_element_op);
            auto b_vgpr_block = elementwise_op(b_vgpr_block_tmp, b_element_op);

            copy(a_vgpr_block, a_lds_block, a_lds_store_strategy);
            copy(b_vgpr_block, b_lds_block, b_lds_store_strategy);

            // barrier between LDS store and the block GEMM's LDS reads
            block_sync_lds();

            dot_product_accumulate(c_vgpr_block, a_lds_block, b_lds_block);

            // barrier before the next iteration overwrites the LDS tiles
            block_sync_lds();

            window_a_dram += {0, kKPerBlock};
            window_b_dram += {0, kKPerBlock};
        }

        // epilogue: shuffle C through LDS, fuse D, write E
        // NOTE(review): `xxx` placeholders kept from the original pseudo code
        auto p_c_lds = xxx;
        auto c_lds   = make_tensor(p_c_lds, xxxxxx);

        // was: window over undefined `c_vgpr` — the accumulator is c_vgpr_block
        auto window_c_vgpr =
            make_window(c_vgpr_block, {kMPerShuffle, kNPerShuffle}, {0, 0}, c_vgpr_window_strategy);

        auto window_c_lds =
            make_window(c_lds, {kMPerShuffle, kNPerShuffle}, {0, 0}, c_lds_window_strategy);

        // NOTE(review): d_dram_global / e_dram_global are not constructed above —
        // presumably windows over `ds` / `e`; confirm against the real implementation.
        // was: tile offsets written id_tile_m * kMPerTile (undefined name)
        auto window_d_dram = make_window(d_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerBlock, id_tile_n * kNPerBlock},
                                         d_dram_window_strategy);

        auto window_e_dram = make_window(e_dram_global,
                                         {kMPerShuffle, kNPerShuffle},
                                         {id_tile_m * kMPerBlock, id_tile_n * kNPerBlock},
                                         e_dram_window_strategy);

        // was: loop variables m / n undeclared
        for(index_t m = 0; m < kMPerBlock; m += kMPerShuffle)
        {
            for(index_t n = 0; n < kNPerBlock; n += kNPerShuffle)
            {
                // write C into LDS for shuffle
                copy(window_c_vgpr, window_c_lds, c_lds_store_strategy);

                // load C from LDS to complete shuffle
                auto c_vgpr_slice_shuffled = load(window_c_lds, c_lds_load_strategy);

                // load D from DRAM
                auto d_vgpr_block_slice = load(window_d_dram, d_dram_load_strategy);

                // element-wise op
                // [Question] need to guarantee it always functions:
                // 1. C/D should have same layout, how to guarantee?
                // 2. if C/D have different layout, then need to do shuffle
                // 3. if C/D have different layout, what should E layout be?
                // was: read undefined c_vgpr_block_slice — the shuffled slice is
                // c_vgpr_slice_shuffled, loaded just above
                auto e_vgpr_block_slice =
                    elementwise_op(c_vgpr_slice_shuffled, d_vgpr_block_slice, cd_elementwise_op);

                // write E into DRAM
                copy(e_vgpr_block_slice, window_e_dram, e_dram_store_strategy);
            }
        }
    }
};
......@@ -19,6 +19,8 @@ struct HelloWorld
auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
// only for testing purpose
// cpu should not do work here
res[0] = desc0.GetLength(ck::Number<0>{});
res[1] = desc1.GetLength(ck::Number<0>{});
}
......
#include "tile_program.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"
// program
// program
// was: `T` used in the signature but never declared — added as a defaulted
// template parameter so existing Im2Col<NDim, ALayout> instantiations still work
template <ck::index_t NDimSpatial, typename ALayout, typename T = float>
struct Im2Col
{
    // Tile-program pseudo code: materialize the im2col (implicit-GEMM) view of a
    // convolution input as an explicit GemmG x GemmM x GemmK matrix in p_a_mtx.
    // Each workgroup copies one M x K tile, iterating over the GemmG dimension.
    __host__ __device__ void
    operator()(TileProgram& tp,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
               const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
               const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
               // was missing: output descriptor arrays — used below and passed by main()
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
               const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_strides,
               const std::array<index_t, NDimSpatial>& conv_filter_dilations,
               const std::array<index_t, NDimSpatial>& input_left_pads,
               const std::array<index_t, NDimSpatial>& input_right_pads,
               //
               // was: std::array<index_t, 2> — these carry three extents
               // (GemmG, GemmM, GemmK), indexed [0..2] below
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_lengths,
               const std::array<index_t, 3> a_gemmg_gemmm_gemmk_strides,
               //
               const T* p_a_img,
               T* p_a_mtx)
    {
        using namespace ck;

        // source: convolution-input view of A, transformed to a GEMM M x K descriptor
        const auto a_src_desc =
            tp(TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>::
                   template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
                                                         a_g_n_c_wis_strides,
                                                         b_g_k_c_xs_lengths,
                                                         b_g_k_c_xs_strides,
                                                         c_g_n_k_wos_lengths,
                                                         c_g_n_k_wos_strides,
                                                         conv_filter_strides,
                                                         conv_filter_dilations,
                                                         input_left_pads,
                                                         input_right_pads));

        // destination: plain strided GemmG x GemmM x GemmK matrix
        // was: a_gemmm_gemmk_lengths / a_gemmm_gemmk_strides (undefined names)
        const auto a_dst_desc = tp(make_naive_tensor_descriptor(a_gemmg_gemmm_gemmk_lengths,
                                                                a_gemmg_gemmm_gemmk_strides));

        const auto a_src = make_tensor(a_src_desc, p_a_img);
        const auto a_dst = make_tensor(a_dst_desc, p_a_mtx);

        // was: a_gemmg_gemmm_gemmk_c_wis_lengths (undefined name)
        const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
        const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
        const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];

        const auto id_block = get_block_1d_id();

        // map the 1-D block id onto the (M-tile, K-tile) grid
        const auto num_tile_m = num_gemmm / MPerTile;
        const auto num_tile_k = num_gemmk / KPerTile;

        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));

        // was: CalculateBottonIndex (typo)
        const auto id_tile = block2tile.CalculateBottomIndex(id_block);

        const auto id_tile_m = id_tile.At<0>();
        const auto id_tile_k = id_tile.At<1>();

#if 0
        // operation-based syntax: per-operation solution strategy
        // operation here is data movement
        auto copier = make_copier(a_src,
                                  a_dst,
                                  make_tuple(1, MPerTile, KPerTile),
                                  make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                  copy_strategy);

        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copier();
            copier.move_src_window(make_tuple(1, 0, 0));
            copier.move_dst_window(make_tuple(1, 0, 0));
        }
#else
        // data-based syntax: per-data solution strategy
        auto window_a_src = make_window(a_src,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_src_window_map_strategy);

        auto window_a_dst = make_window(a_dst,
                                        make_tuple(1, MPerTile, KPerTile),
                                        make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
                                        a_dst_window_map_strategy);

        // walk the GemmG (group/batch) dimension, copying one slice per step
        for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
        {
            copy(window_a_src, window_a_dst, a_copy_strategy);

            window_a_src += make_tuple(1, 0, 0);
            window_a_dst += make_tuple(1, 0, 0);
        }
#endif
    }
};
int main()
{
    // Element type of the input image / matrix buffers.
    // was: InDataType used below but never defined
    using InDataType = float;

    // was: non-constexpr, but used below as a std::array extent
    // (a non-constant expression is ill-formed there)
    constexpr ck::index_t NumDimSpatial = 2;

    // grouped 2-D convolution problem: G groups, NHWC-style image, 3x3 filter
    ck::index_t G  = 32;
    ck::index_t N  = 256;
    ck::index_t K  = 192;
    ck::index_t C  = 192;
    ck::index_t Y  = 3;
    ck::index_t X  = 3;
    ck::index_t Hi = 28;
    ck::index_t Wi = 28;
    ck::index_t Ho = 28;
    ck::index_t Wo = 28;

    // lengths in memory order; strides seeded with innermost stride 1 and
    // filled in below as packed suffix products
    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};

    // packed strides: stride[i] = product of lengths[i+1..end]
    std::partial_sum(rbegin(in_lengths),
                     std::prev(rend(in_lengths)),
                     std::next(rbegin(in_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(wei_lengths),
                     std::prev(rend(wei_lengths)),
                     std::next(rbegin(wei_strides)),
                     std::multiplies<>{});

    std::partial_sum(rbegin(out_lengths),
                     std::prev(rend(out_lengths)),
                     std::next(rbegin(out_strides)),
                     std::multiplies<>{});

    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
    // (rotate the last three length/stride entries so C leads the spatial dims)
    std::rotate(
        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
    std::rotate(
        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
    std::rotate(
        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
    std::rotate(
        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
    std::rotate(
        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
    std::rotate(
        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    // GEMM view of A: GemmG = G, GemmM = N * Ho * Wo, GemmK = C * Y * X
    // was: size NumDimSpatial + 3 (5 slots) with only 3 initializers, and
    // GemmM written as G * Ho * Wo — the matrix buffer below is sized with
    // N * Ho * Wo, so N is intended here
    std::array<ck::index_t, 3> in_mtx_lengths{G, N * Ho * Wo, C * Y * X};
    std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};

    std::partial_sum(rbegin(in_mtx_lengths),
                     std::prev(rend(in_mtx_lengths)),
                     std::next(rbegin(in_mtx_strides)),
                     std::multiplies<>{});

    DeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    DeviceMem in_mtx(sizeof(InDataType) * G * N * Ho * Wo * C * Y * X);

    // was: launched undefined `HelloWorld{}` — the functor defined above is Im2Col.
    // NOTE(review): layout tag assumed to describe the original GNHWC memory
    // layout (strides were rotated above) — confirm against TransformConvFwdToGemm.
    launch(Im2Col<NumDimSpatial, ck::tensor_layout::convolution::GNHWC>{},
           1,
           1,
           in_lengths,
           in_strides,
           wei_lengths,
           wei_strides,
           out_lengths,
           out_strides,
           filter_strides,
           filter_dilations,
           input_left_pads,
           input_right_pads,
           //
           in_mtx_lengths,
           in_mtx_strides,
           //
           in.GetDeviceBuffer(),
           in_mtx.GetDeviceBuffer());

    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment