Commit fe2ea9b6 authored by Chao Liu's avatar Chao Liu
Browse files

tile im2col

parent c5b3f69f
add_example_executable(example_hello_world hello_world.cpp)
add_example_executable(example_im2col im2col.cpp)
#pragma once
#include "tile_program.hpp"
template
namespace ck{make_tensor};
......@@ -27,6 +27,7 @@ struct GemmMultiD
{
using namespace ck;
const auto a = tp(make_naive_tensor(a_m_k_lengths, a_m_k_strides), p_a);
const auto b = tp(make_naive_tensor(b_n_k_lengths, b_n_k_strides), p_b);
const auto ds = tp(generate_tuple(
[&](auto i) {
......@@ -95,7 +96,7 @@ struct GemmMultiD
a_dram_window_map_strategy);
auto window_a_block =
make_window(a_lds_block, {NPerTile, KPerTile}, {0, 0}, a_lds_window_map_strategy);
make_window(a_lds_block, {MPerTile, KPerTile}, {0, 0}, a_lds_window_map_strategy);
#endif
......
#include "tile_program.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "ck/library/utility/device_memory.hpp"
// program
template <ck::index_t NDimSpatial, typename ALayout>
template <ck::index_t NDimSpatial,
typename ALayout,
typename T,
// tuning parameter
ck::index_t kMPerTile,
ck::index_t kKPerTile>
struct Im2Col
{
__host__ __device__ void
operator()(TileProgram& tp,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<ck::index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<ck::index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
//
const std::array<index_t, 2> a_gemmg_gemmm_gemmk_lengths,
const std::array<index_t, 2> a_gemmg_gemmm_gemmk_strides,
const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_lengths,
const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_strides,
//
const T* p_a_img,
T* p_a_mtx)
{
using namespace ck;
const auto a_src_desc =
tp(TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>::
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
NDimSpatial,
tensor_operation::device::ConvolutionForwardSpecialization::Default>::
template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
......@@ -45,48 +57,37 @@ struct Im2Col
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads));
input_right_pads);
const auto a_dst_desc =
tp(make_naive_tensor_descriptor(a_gemmm_gemmk_lengths, a_gemmm_gemmk_strides));
make_naive_tensor_descriptor(make_tuple(a_gemmg_gemmm_gemmk_lengths[0],
a_gemmg_gemmm_gemmk_lengths[1],
a_gemmg_gemmm_gemmk_lengths[2]),
make_tuple(a_gemmg_gemmm_gemmk_strides[0],
a_gemmg_gemmm_gemmk_strides[1],
a_gemmg_gemmm_gemmk_strides[2]));
const auto a_src = make_tensor(a_src_desc, p_a_img);
const auto a_src = tp(make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img));
const auto a_dst = make_tensor(a_dst_desc, p_a_mtx);
auto a_dst = tp(make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx));
const auto num_gemmg = a_gemmg_gemmm_gemmk_c_wis_lengths[0];
const auto num_gemmm = a_gemmg_gemmm_gemmk_c_wis_lengths[1];
const auto num_gemmk = a_gemmg_gemmm_gemmk_c_wis_lengths[2];
const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];
const auto id_block = get_block_1d_id();
const auto id_block = tp.get_block_1d_id();
const auto num_tile_m = num_gemmm / MPerTile;
const auto num_tile_k = num_gemmk / KPerTile;
const auto num_tile_m = num_gemmm / kMPerTile;
const auto num_tile_k = num_gemmk / kKPerTile;
const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));
const auto id_tile = block2tile.CalculateBottonIndex(id_block);
const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
const auto id_tile_m = id_tile.At<0>();
const auto id_tile_k = id_tile.At<1>();
#if 0
// operation-based syntax: per-oeration solution strategy
// operation here is data movement
auto copier = make_copier(a_src,
a_dst,
make_tuple(1, MPerTile, KPerTile),
make_tuple(0, id_tile_m * MPerTile, id_tile_k * KPerTile),
copy_strategy);
for(ck::index_t id_gemmg = 0; id_gemmg < num_gemmg; id_gemmg++)
{
copier();
const auto id_tile_m = id_tile[I0];
const auto id_tile_k = id_tile[I1];
copier.move_src_window(make_tuple(1, 0, 0));
copier.move_dst_window(make_tuple(1, 0, 0));
}
#else
#if 1
// data-based syntax: per-data solution strategy
auto window_a_src = make_window(a_src,
make_tuple(1, MPerTile, KPerTile),
......@@ -105,13 +106,17 @@ struct Im2Col
window_a_src += make_tuple(1, 0, 0);
window_a_dst += make_tuple(1, 0, 0);
}
#else
#endif
}
};
int main()
{
ck::index_t NumDimSpatial = 2;
using DataType = float;
constexpr ck::index_t NumDimSpatial = 2;
ck::index_t G = 32;
ck::index_t N = 256;
ck::index_t K = 192;
......@@ -165,20 +170,20 @@ int main()
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
// matrix
std::array<ck::index_t, NumDimSpatial + 3> in_mtx_lengths{G, G * Ho * Wo, C * Y * X};
std::array<ck::index_t, NumDimSpatial + 3> in_mtx_strides{0, 0, 1};
std::array<ck::index_t, 3> in_mtx_lengths{G, G * Ho * Wo, C * Y * X};
std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};
std::partial_sum(rbegin(in_mtx_lengths),
std::prev(rend(in_mtx_lengths)),
std::next(rbegin(in_mtx_strides)),
std::multiplies<>{});
DeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
DeviceMem in_mtx(sizeof(InDataType) * G * N * Ho * Wo * C * Y * X);
DeviceMem in(sizeof(DataType) * G * N * Hi * Wi * C);
DeviceMem in_mtx(sizeof(DataType) * G * N * Ho * Wo * C * Y * X);
launch(HelloWorld{},
1,
launch(Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 128>{},
1,
256,
in_lengths,
in_strides,
wei_lengths,
......@@ -193,8 +198,8 @@ int main()
in_mtx_lengths,
in_mtx_strides,
//
in.GetDeviceBuffer(),
in_mtx.GetDeviceBuffer());
static_cast<DataType*>(in.GetDeviceBuffer()),
static_cast<DataType*>(in_mtx.GetDeviceBuffer()));
return 0;
}
......@@ -65,6 +65,14 @@ struct TileProgram
{
return arg_.pull<T>();
}
// Host-side fallbacks: there is no block id / grid size on the host, so a
// sentinel value (-1) is returned.
__host__ static ck::index_t get_block_1d_id() { return -1; }
__host__ static ck::index_t get_grid_size() { return -1; }
// Device-side versions forward to the ck:: wrappers of the runtime intrinsics.
// NOTE(review): defining separate __host__ and __device__ functions with the
// same signature relies on HIP-clang's attribute-based overloading — confirm
// this builds on every supported toolchain.
__device__ static ck::index_t get_block_1d_id() { return ck::get_block_1d_id(); }
__device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
};
template <typename Program, typename... Xs>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
template <AddressSpaceEnum AddressSpace,
bool InvalidElementUseNumericalZeroValue,
typename T,
typename TensorDesc>
struct Tensor
{
static constexpr AddressSpaceEnum kAdressSpace_ = AddressSpace;
static constexpr bool kInvalidElementUseNumericalZeroValue_ =
InvalidElementUseNumericalZeroValue;
__host__ __device__ constexpr Tensor() : buf_{nullptr, 0}, desc_{} {}
__host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc)
: buf_{p_data, desc.GetElementSpaceSize()}, desc_{desc}
{
}
__host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc, T invalid_element_value)
: buf_{p_data, desc.GetElementSpaceSize(), invalid_element_value}, desc_{desc}
{
}
DynamicBuffer<AddressSpace,
T,
typename TensorDesc::ElementSpaceSizeType,
InvalidElementUseNumericalZeroValue>
buf_;
TensorDesc desc_;
};
// Factory helper: deduce the concrete Tensor type from the descriptor and
// data pointer, stripping cv/ref qualifiers from the descriptor type.
template <AddressSpaceEnum AddressSpace,
          bool InvalidElementUseNumericalZeroValue,
          typename T,
          typename TensorDesc>
__host__ __device__ constexpr auto make_tensor(const TensorDesc& desc, T* p_data)
{
    using ResultTensor =
        Tensor<AddressSpace, InvalidElementUseNumericalZeroValue, T, remove_cvref_t<TensorDesc>>;

    return ResultTensor{p_data, desc};
}
// Produce a new (still non-owning) Tensor that shares old_tensor's buffer but
// views it through a transformed descriptor.
//   new_transforms                    - tuple of coordinate transforms to apply
//   NewLowerDimensionOldVisibleIdss   - mapping of transform inputs to old dims
//   NewUpperDimensionNewVisibleIdss   - mapping of transform outputs to new dims
template <typename OldTensor,
          typename NewTransforms,
          typename NewLowerDimensionOldVisibleIdss,
          typename NewUpperDimensionNewVisibleIdss>
__host__ __device__ constexpr auto transform_tensor(const OldTensor& old_tensor,
                                                    const NewTransforms& new_transforms,
                                                    NewLowerDimensionOldVisibleIdss,
                                                    NewUpperDimensionNewVisibleIdss)
{
    // The descriptor-level API is transform_tensor_descriptor; calling
    // transform_tensor here would recurse into this very function with a
    // descriptor (which has no .desc_ member) and fail to compile.
    const auto new_desc = transform_tensor_descriptor(old_tensor.desc_,
                                                      new_transforms,
                                                      NewLowerDimensionOldVisibleIdss{},
                                                      NewUpperDimensionNewVisibleIdss{});

    // Element type of the old tensor's buffer, derived from the stored
    // pointer (Tensor declares no DataType alias to rely on).
    using ElementType = std::remove_pointer_t<decltype(old_tensor.buf_.p_data_)>;

    // Use the member names Tensor actually declares: kAdressSpace_ [sic] and
    // kInvalidElementUseNumericalZeroValue_ (the previous spellings
    // kAddressSpace_ / kInvalidElementUseNumericalZeroValue do not exist).
    return Tensor<OldTensor::kAdressSpace_,
                  OldTensor::kInvalidElementUseNumericalZeroValue_,
                  ElementType,
                  remove_cvref_t<decltype(new_desc)>>{old_tensor.buf_.p_data_, new_desc};
}
} // namespace ck
......@@ -26,6 +26,8 @@ template <typename Transforms,
typename ElementSpaceSize>
struct TensorDescriptor
{
using ElementSpaceSizeType = ElementSpaceSize;
// TODO make these private
__host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); }
......
......@@ -24,7 +24,7 @@ struct TransformConvFwdToGemm
typename std::enable_if<NDimSpatial == 1 &&
is_same_v<ALayout, tensor_layout::convolution::GNWC>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -119,7 +119,7 @@ struct TransformConvFwdToGemm
typename std::enable_if<NDimSpatial == 2 &&
is_same_v<ALayout, tensor_layout::convolution::GNHWC>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -230,7 +230,7 @@ struct TransformConvFwdToGemm
typename std::enable_if<NDimSpatial == 3 &&
is_same_v<ALayout, tensor_layout::convolution::GNDHWC>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -363,7 +363,7 @@ struct TransformConvFwdToGemm
(is_same_v<ALayout, tensor_layout::convolution::G_NW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NWGC>),
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -475,7 +475,7 @@ struct TransformConvFwdToGemm
NDimSpatial == 2 && (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NHWGC>),
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -603,7 +603,7 @@ struct TransformConvFwdToGemm
NDimSpatial == 3 && (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> ||
is_same_v<ALayout, tensor_layout::convolution::NDHWGC>),
bool>::type = false>
static auto
__host__ __device__ static auto
MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -754,7 +754,7 @@ struct TransformConvFwdToGemm
is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
is_same_v<BLayout, tensor_layout::convolution::GKZYXC>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeBDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */)
{
......@@ -779,7 +779,8 @@ struct TransformConvFwdToGemm
is_same_v<BLayout, tensor_layout::convolution::KYXGC> ||
is_same_v<BLayout, tensor_layout::convolution::KZYXGC>,
bool>::type = false>
static auto MakeBDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
__host__ __device__ static auto
MakeBDescriptor_N_K(const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
{
const index_t K = b_g_k_c_xs_lengths[1];
......@@ -809,7 +810,7 @@ struct TransformConvFwdToGemm
is_same_v<CLayout, tensor_layout::convolution::GNHWK> ||
is_same_v<CLayout, tensor_layout::convolution::GNDHWK>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
{
......@@ -834,7 +835,8 @@ struct TransformConvFwdToGemm
is_same_v<CLayout, tensor_layout::convolution::NHWGK> ||
is_same_v<CLayout, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
__host__ __device__ static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
{
const index_t N = c_g_n_k_wos_lengths[1];
......@@ -858,7 +860,7 @@ struct TransformConvFwdToGemm
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GK> ||
is_same_v<CLayout, tensor_layout::convolution::G_K>,
bool>::type = false>
static auto
__host__ __device__ static auto
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
{
......
......@@ -26,7 +26,7 @@ struct DynamicBuffer
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment